In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from PIL import Image
import time

2025-05-08 00:13:51.809310: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-08 00:13:51.835058: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-08 00:13:51.835088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-08 00:13:51.835990: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-08 00:13:51.840121: I tensorflow/core/platform/cpu_feature_guar

In [2]:
seed = 100
np.random.seed(seed)
tf.random.set_seed(seed)

## Defining the BasicLSTM, GRU and MGU Architectures

In [3]:

# RNN Cell implementations
class BasicLSTM_cell(object):
    def __init__(self, input_units, hidden_units, output_units):
        # Initialize weights and biases
        self.input_units = input_units
        self.hidden_units = hidden_units
        self.output_units = output_units
        
        # Input gate weights
        self.Wi = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Ui = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bi = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Forget gate weights
        self.Wf = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Uf = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bf = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Output gate weights
        self.Woutg = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Uoutg = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.boutg = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Cell state weights
        self.Wc = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Uc = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bc = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Output layer weights
        self.Wo = tf.Variable(tf.random.truncated_normal([self.hidden_units, self.output_units], mean=0, stddev=.02))
        self.bo = tf.Variable(tf.random.truncated_normal([self.output_units], mean=0, stddev=.02))
    
    def initialize_state(self, batch_size):
        # Initialize hidden state and cell state with zeros
        h_init = tf.zeros([batch_size, self.hidden_units])
        c_init = tf.zeros([batch_size, self.hidden_units])
        return tf.stack([h_init, c_init])
    
    def Lstm(self, previous_hidden_memory, x):
        # Unstack previous hidden state and cell state
        previous_hidden_state, c_prev = tf.unstack(previous_hidden_memory)
        
        # Input gate
        i = tf.sigmoid(tf.matmul(x, self.Wi) + 
                      tf.matmul(previous_hidden_state, self.Ui) + self.bi)
        
        # Forget gate
        f = tf.sigmoid(tf.matmul(x, self.Wf) + 
                      tf.matmul(previous_hidden_state, self.Uf) + self.bf)
        
        # Output gate
        o = tf.sigmoid(tf.matmul(x, self.Woutg) + 
                      tf.matmul(previous_hidden_state, self.Uoutg) + self.boutg)
        
        # New cell state candidate
        c_ = tf.tanh(tf.matmul(x, self.Wc) + 
                    tf.matmul(previous_hidden_state, self.Uc) + self.bc)
        
        # Update cell state
        c = f * c_prev + i * c_
        
        # Update hidden state
        current_hidden_state = o * tf.tanh(c)
        
        return tf.stack([current_hidden_state, c])
    
    def process_sequence(self, inputs, initial_state=None):
        # Process a sequence of inputs
        batch_size = tf.shape(inputs)[0]
        
        # Initialize state if not provided
        if initial_state is None:
            initial_state = self.initialize_state(batch_size)
        
        # Transpose inputs to time-major format
        inputs_time_major = tf.transpose(inputs, perm=[1, 0, 2])
        
        # Scan through time steps
        all_states = tf.scan(self.Lstm, inputs_time_major, initializer=initial_state)
        
        # Extract hidden states
        all_hidden_states = all_states[:, 0, :, :]
        
        # Apply output layer to each hidden state
        all_outputs = tf.map_fn(lambda h: tf.nn.relu(tf.matmul(h, self.Wo) + self.bo), 
                               all_hidden_states)
        
        # Convert back to batch-major format
        return tf.transpose(all_outputs, perm=[1, 0, 2])


class GRU_cell(object):
    def __init__(self, input_units, hidden_units, output_units):
        self.input_units = input_units
        self.hidden_units = hidden_units
        self.output_units = output_units
        
        # Update gate weights
        self.Wz = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Uz = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units])) 
        self.bz = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Reset gate weights
        self.Wr = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Ur = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.br = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Candidate state weights
        self.Ws = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Us = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bs = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Output layer weights
        self.Wo = tf.Variable(tf.random.truncated_normal([self.hidden_units, self.output_units], mean=0, stddev=.02))
        self.bo = tf.Variable(tf.random.truncated_normal([self.output_units], mean=0, stddev=.02))
    
    def initialize_state(self, batch_size):
        # Initialize hidden state with zeros
        return tf.zeros([batch_size, self.hidden_units])
    
    def Gru(self, previous_state, x):
        # Update gate
        z = tf.sigmoid(tf.matmul(x, self.Wz) + 
                      tf.matmul(previous_state, self.Uz) + self.bz)
        
        # Reset gate
        r = tf.sigmoid(tf.matmul(x, self.Wr) + 
                      tf.matmul(previous_state, self.Ur) + self.br)
        
        # Candidate state
        s_tilde = tf.tanh(tf.matmul(x, self.Ws) + 
                         tf.matmul(r * previous_state, self.Us) + self.bs)
        
        # Update state
        current_state = (1 - z) * previous_state + z * s_tilde
        
        return current_state
    
    def process_sequence(self, inputs, initial_state=None):
        # Process a sequence of inputs
        batch_size = tf.shape(inputs)[0]
        
        # Initialize state if not provided
        if initial_state is None:
            initial_state = self.initialize_state(batch_size)
        
        # Transpose inputs to time-major format
        inputs_time_major = tf.transpose(inputs, perm=[1, 0, 2])
        
        # Scan through time steps
        all_states = tf.scan(self.Gru, inputs_time_major, initializer=initial_state)
        
        # Apply output layer to each hidden state
        all_outputs = tf.map_fn(lambda h: tf.nn.relu(tf.matmul(h, self.Wo) + self.bo), 
                               all_states)
        
        # Convert back to batch-major format
        return tf.transpose(all_outputs, perm=[1, 0, 2])


class MGU_cell(object):
    def __init__(self, input_units, hidden_units, output_units):
        self.input_units = input_units
        self.hidden_units = hidden_units
        self.output_units = output_units
        
        # Forget/update gate weights
        self.Wf = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Uf = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bf = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Candidate state weights
        self.Ws = tf.Variable(tf.zeros([self.input_units, self.hidden_units]))
        self.Us = tf.Variable(tf.zeros([self.hidden_units, self.hidden_units]))
        self.bs = tf.Variable(tf.zeros([self.hidden_units]))
        
        # Output layer weights
        self.Wo = tf.Variable(tf.random.truncated_normal([self.hidden_units, self.output_units], mean=0, stddev=.02))
        self.bo = tf.Variable(tf.random.truncated_normal([self.output_units], mean=0, stddev=.02))
    
    def initialize_state(self, batch_size):
        # Initialize hidden state with zeros
        return tf.zeros([batch_size, self.hidden_units])
    
    def Mgu(self, previous_state, x):
        # Forget gate (equivalent to update gate)
        f = tf.sigmoid(tf.matmul(x, self.Wf) + 
                     tf.matmul(previous_state, self.Uf) + self.bf)
        
        # Candidate state
        s_tilde = tf.tanh(tf.matmul(x, self.Ws) + 
                        tf.matmul(f * previous_state, self.Us) + self.bs)
        
        # Update state
        current_state = (1 - f) * previous_state + f * s_tilde
        
        return current_state
    
    def process_sequence(self, inputs, initial_state=None):
        # Process a sequence of inputs
        batch_size = tf.shape(inputs)[0]
        
        # Initialize state if not provided
        if initial_state is None:
            initial_state = self.initialize_state(batch_size)
        
        # Transpose inputs to time-major format
        inputs_time_major = tf.transpose(inputs, perm=[1, 0, 2])
        
        # Scan through time steps
        all_states = tf.scan(self.Mgu, inputs_time_major, initializer=initial_state)
        
        # Apply output layer to each hidden state
        all_outputs = tf.map_fn(lambda h: tf.nn.relu(tf.matmul(h, self.Wo) + self.bo), 
                               all_states)
        
        # Convert back to batch-major format
        return tf.transpose(all_outputs, perm=[1, 0, 2])




## Defining the data loading function

In [4]:
# Data loading and preprocessing functions
def load_notmnist_dataset(data_dir):
    # List all subdirectories (each represents a class)
    class_dirs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    class_dirs.sort()  # Ensure consistent ordering
    
    images = []
    labels = []
    
    # Load images from each class directory
    for class_idx, class_dir in enumerate(class_dirs):
        class_path = os.path.join(data_dir, class_dir)
        file_list = glob.glob(os.path.join(class_path, '*.png')) + glob.glob(os.path.join(class_path, '*.jpg'))
        
        for file_path in file_list:
            try:
                # Load image as grayscale
                img = Image.open(file_path).convert('L')
                img = img.resize((28, 28))  # Resize to consistent dimensions
                img_array = np.array(img, dtype=np.float32) / 255.0  # Normalize to [0, 1]
                
                images.append(img_array)
                labels.append(class_idx)
            except:
                # Skip corrupted images
                print(f"Error loading {file_path}, skipping...")
    
    # Convert to numpy arrays
    images = np.array(images)
    labels = np.array(labels)
    
    # Shuffle the data
    indices = np.arange(len(images))
    np.random.shuffle(indices)
    images = images[indices]
    labels = labels[indices]
    
    # Split into train and test sets (80% train, 20% test)
    split_idx = int(len(images) * 0.8)
    x_train, x_test = images[:split_idx], images[split_idx:]
    y_train, y_test = labels[:split_idx], labels[split_idx:]
    
    return (x_train, y_train), (x_test, y_test)


## Training Function

In [5]:
# Training function
def train_rnn_model(rnn_cell, dataset, epochs=10, batch_size=128, learning_rate=0.001):
    # Split dataset into training and testing
    (x_train, y_train), (x_test, y_test) = dataset
    
    # Reshape input for RNN: we'll treat each row of the image as a time step
    # So for a 28x28 image, we have 28 time steps of 28 features each
    x_train = x_train.reshape(-1, 28, 28)
    x_test = x_test.reshape(-1, 28, 28)
    
    # Convert to TensorFlow datasets
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size)
    
    test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
    test_dataset = test_dataset.batch(batch_size)
    
    # Define optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    # Define loss function
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    # Training history
    history = {
        'train_loss': [],
        'train_accuracy': [],
        'test_loss': [],
        'test_accuracy': [],
        'time':[]
    }
    
    # Get all trainable variables
    def get_trainable_vars(cell):
        return [var for var in cell.__dict__.values() if isinstance(var, tf.Variable)]
        
    start_time =time.time()
    end_time = 0
    
    trainable_vars = get_trainable_vars(rnn_cell)
    
    # Training loop
    for epoch in range(epochs):
        # Training metrics for this epoch
        train_loss_total = 0.0
        train_accuracy_total = 0.0
        train_batch_count = 0
        
        # Test metrics for this epoch
        test_loss_total = 0.0
        test_accuracy_total = 0.0
        test_batch_count = 0
        
        # Training
        for x_batch, y_batch in train_dataset:
            with tf.GradientTape() as tape:
                # Forward pass
                predictions = rnn_cell.process_sequence(x_batch)
                # Use the last output for classification
                logits = predictions[:, -1, :]
                loss = loss_fn(y_batch, logits)
            
            # Calculate gradients
            gradients = tape.gradient(loss, trainable_vars)
            
            # Apply gradients
            optimizer.apply_gradients(zip(gradients, trainable_vars))
            
            # Update metrics
            train_loss_total += loss.numpy()
            train_accuracy_total += tf.reduce_mean(
                tf.cast(tf.equal(tf.argmax(logits, axis=1), y_batch), tf.float32)
            ).numpy()
            train_batch_count += 1
        
        # Calculate average training metrics
        avg_train_loss = train_loss_total / train_batch_count
        avg_train_accuracy = train_accuracy_total / train_batch_count

        
        end_time = time.time()

        
        # Testing
        for x_batch, y_batch in test_dataset:
            # Forward pass
            predictions = rnn_cell.process_sequence(x_batch)
            # Use the last output for classification
            logits = predictions[:, -1, :]
            loss = loss_fn(y_batch, logits)
            
            # Update metrics
            test_loss_total += loss.numpy()
            test_accuracy_total += tf.reduce_mean(
                tf.cast(tf.equal(tf.argmax(logits, axis=1), y_batch), tf.float32)
            ).numpy()
            test_batch_count += 1
        
        # Calculate average test metrics
        avg_test_loss = test_loss_total / test_batch_count
        avg_test_accuracy = test_accuracy_total / test_batch_count
        
        # Record history
        history['train_loss'].append(avg_train_loss)
        history['train_accuracy'].append(avg_train_accuracy)
        history['test_loss'].append(avg_test_loss)
        history['test_accuracy'].append(avg_test_accuracy)
        
        # Print epoch results
        print(f'Epoch {epoch + 1}/{epochs}, '
              f'Loss: {avg_train_loss:.4f}, '
              f'Accuracy: {avg_train_accuracy:.4f}, '
              f'Test Loss: {avg_test_loss:.4f}, '
              f'Test Accuracy: {avg_test_accuracy:.4f}')
    history['time'].append(end_time-start_time)
    return history


## Running all the trials

In [6]:
# Run experiments
def run_experiments(data_dir, num_trials=3, hidden_units=64, epochs=5):
    results = {
        'rnn': [],
        'gru': [],
        'mgu': []
    }
    
    input_units = 28  
    output_units = 10  # since notMNIST has 10 classes (A to J)
    
    # Summary table for classification errors
    error_summary = {
        'rnn_train_error': [],
        'rnn_test_error': [],
        'rnn_time': [],
        'gru_train_error': [],
        'gru_test_error': [],
        'gru_time':[],
        'mgu_train_error': [],
        'mgu_test_error': [],
        'mgu_time':[]
    }
    
    for trial in range(num_trials):
        print(f"Trial {trial + 1}/{num_trials}")
        
        # Load datase
        dataset = load_notmnist_dataset(data_dir)
        
        # Train Basic RNN
        print("Training Basic RNN...")
        rnn_cell = BasicLSTM_cell(input_units, hidden_units, output_units)
        rnn_history = train_rnn_model(rnn_cell, dataset, epochs=epochs)
        results['rnn'].append(rnn_history)
        
        # Calculating final errors for RNN
        final_rnn_train_error = 1.0 - rnn_history['train_accuracy'][-1]
        final_rnn_test_error = 1.0 - rnn_history['test_accuracy'][-1]
        final_rnn_time = rnn_history['time'][-1]
        error_summary['rnn_train_error'].append(final_rnn_train_error)
        error_summary['rnn_test_error'].append(final_rnn_test_error)
        error_summary['rnn_time'].append(final_rnn_time)
        
        # Train GRU
        print("Training GRU...")
        gru_cell = GRU_cell(input_units, hidden_units, output_units)
        gru_history = train_rnn_model(gru_cell, dataset, epochs=epochs)
        results['gru'].append(gru_history)
        
        # Calculate final errors for GRU
        final_gru_train_error = 1.0 - gru_history['train_accuracy'][-1]
        final_gru_test_error = 1.0 - gru_history['test_accuracy'][-1]
        final_gru_time = gru_history['time'][-1]
        error_summary['gru_train_error'].append(final_gru_train_error)
        error_summary['gru_test_error'].append(final_gru_test_error)
        error_summary['gru_time'].append(final_gru_time)
        
        # Train MGU
        print("Training MGU...")
        mgu_cell = MGU_cell(input_units, hidden_units, output_units)
        mgu_history = train_rnn_model(mgu_cell, dataset, epochs=epochs)
        results['mgu'].append(mgu_history)
        
        # Calculating final errors for MGU
        final_mgu_train_error = 1.0 - mgu_history['train_accuracy'][-1]
        final_mgu_test_error = 1.0 - mgu_history['test_accuracy'][-1]
        final_mgu_time = mgu_history['time'][-1]
        error_summary['mgu_train_error'].append(final_mgu_train_error)
        error_summary['mgu_test_error'].append(final_mgu_test_error)
        error_summary['mgu_time'].append(final_mgu_time)
        
        # Reporting errors for this trial
        print(f"\nClassification Error Report for Trial {trial + 1}:")
        print(f"RNN - Training Error: {final_rnn_train_error:.4f}, Test Error: {final_rnn_test_error:.4f}, time: {final_rnn_time:.4f} seconds")
        print(f"GRU - Training Error: {final_gru_train_error:.4f}, Test Error: {final_gru_test_error:.4f}, time: {final_gru_time:.4f} seconds")
        print(f"MGU - Training Error: {final_mgu_train_error:.4f}, Test Error: {final_mgu_test_error:.4f}, time: {final_mgu_time:.4f} seconds")
        print("-" * 50)
    
    # Calculate and report average errors across all trials
    print("\nSummary of Classification Errors Across All Trials:")
    print(f"RNN - Avg Training Error: {np.mean(error_summary['rnn_train_error']):.4f} ± {np.std(error_summary['rnn_train_error']):.4f}")
    print(f"RNN - Avg Test Error: {np.mean(error_summary['rnn_test_error']):.4f} ± {np.std(error_summary['rnn_test_error']):.4f}")
    print(f"RNN - Avg Time: {np.mean(error_summary['rnn_time']):.4f} ± {np.std(error_summary['rnn_time']):.4f} seconds\n")
    
    print(f"GRU - Avg Training Error: {np.mean(error_summary['gru_train_error']):.4f} ± {np.std(error_summary['gru_train_error']):.4f}")
    print(f"GRU - Avg Test Error: {np.mean(error_summary['gru_test_error']):.4f} ± {np.std(error_summary['gru_test_error']):.4f}")
    print(f"GRU - Avg Time: {np.mean(error_summary['gru_time']):.4f} ± {np.std(error_summary['gru_time']):.4f} seconds\n")
    
    print(f"MGU - Avg Training Error: {np.mean(error_summary['mgu_train_error']):.4f} ± {np.std(error_summary['mgu_train_error']):.4f}")
    print(f"MGU - Avg Test Error: {np.mean(error_summary['mgu_test_error']):.4f} ± {np.std(error_summary['mgu_test_error']):.4f}")
    print(f"MGU - Avg Time: {np.mean(error_summary['mgu_time']):.4f} ± {np.std(error_summary['mgu_time']):.4f} seconds")
    
    return results, error_summary

   

## Plotting Function

In [7]:
# Plot results
def plot_results(results):
    models = ['rnn', 'gru', 'mgu']
    metrics = ['train_accuracy', 'test_accuracy']
    
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    
    for i, metric in enumerate(metrics):
        ax = axes[i]
        
        for model in models:
            # Calculate mean and std across trials
            trials_data = [trial[metric] for trial in results[model]]
            mean_data = np.mean(trials_data, axis=0)
            std_data = np.std(trials_data, axis=0)
            
            # Plot mean with std as shaded area
            epochs = range(1, len(mean_data) + 1)
            ax.plot(epochs, mean_data, label=model.upper())
            ax.fill_between(epochs, mean_data - std_data, mean_data + std_data, alpha=0.2)
        
        ax.set_title(f'{metric.replace("_", " ").title()}')
        ax.set_xlabel('Epochs')
        ax.set_ylabel('Accuracy')
        ax.legend()
        ax.grid(True)
    
    plt.tight_layout()
    plt.savefig('rnn_comparison.png')
    plt.show()


## Running the experiment across the three trials

In [8]:

def main():
    
    data_dir = r"./notMNIST_small"
    
    # Run experiments
    results, error_summary = run_experiments(data_dir, num_trials=3, hidden_units=64, epochs=5)
    
    # Plot results
    plot_results(results)
    
    
    plt.figure(figsize=(12, 6))
    
    # Data for the bar chart
    models = ['RNN', 'GRU', 'MGU']
    train_errors = [
        np.mean(error_summary['rnn_train_error']), 
        np.mean(error_summary['gru_train_error']), 
        np.mean(error_summary['mgu_train_error'])
    ]
    test_errors = [
        np.mean(error_summary['rnn_test_error']), 
        np.mean(error_summary['gru_test_error']), 
        np.mean(error_summary['mgu_test_error'])
    ]
    train_std = [
        np.std(error_summary['rnn_train_error']), 
        np.std(error_summary['gru_train_error']), 
        np.std(error_summary['mgu_train_error'])
    ]
    test_std = [
        np.std(error_summary['rnn_test_error']), 
        np.std(error_summary['gru_test_error']), 
        np.std(error_summary['mgu_test_error'])
    ]
    train_n_test_times = [
        np.std(error_summary['rnn_time']), 
        np.std(error_summary['gru_time']), 
        np.std(error_summary['mgu_time'])
    ]
    
    # Positioning for bars
    x = np.arange(len(models))
    width = 0.35
    
    # Create bars
    plt.bar(x - width/2, train_errors, width, label='Training Error', yerr=train_std, capsize=5)
    plt.bar(x + width/2, test_errors, width, label='Test Error', yerr=test_std, capsize=5)
    
    
    plt.xlabel('Model')
    plt.ylabel('Classification Error')
    plt.title('Classification Error Comparison: RNN vs GRU vs MGU')
    plt.xticks(x, models)
    plt.legend()
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)

   
    # Adding value labels on top of the bars
    for i, v in enumerate(train_errors):
        plt.text(i - width/2, v + 0.01, f'{v:.4f}', ha='center')
    for i, v in enumerate(test_errors):
        plt.text(i + width/2, v + 0.01, f'{v:.4f}', ha='center')
    
    plt.tight_layout()
    plt.savefig('classification_error_comparison.png')
    plt.show()
    plt.close()

    # Creating the bar chart for the average time it took each model across the three trials
    plt.bar(x, train_n_test_times) 
    plt.xlabel('Model')
    plt.ylabel('Classification Time')
    plt.title('Classification Time Comparison: RNN vs GRU vs MGU')
    plt.xticks(x, models)
    plt.legend()
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)


In [None]:
if __name__ == "__main__":
    main()

Trial 1/3
Error loading ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png, skipping...
Error loading ./notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png, skipping...
Training Basic RNN...


2025-05-08 00:15:46.690742: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-05-08 00:15:46.707027: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-05-08 00:15:46.707064: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-05-08 00:15:46.711065: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-05-08 00:15:46.711096: I external/local_xla/xla/stream_executor

Epoch 1/5, Loss: 2.0269, Accuracy: 0.2822, Test Loss: 1.6832, Test Accuracy: 0.4341
Epoch 2/5, Loss: 1.3074, Accuracy: 0.6118, Test Loss: 0.9238, Test Accuracy: 0.7326
Epoch 3/5, Loss: 0.6880, Accuracy: 0.8062, Test Loss: 0.5678, Test Accuracy: 0.8364
Epoch 4/5, Loss: 0.5021, Accuracy: 0.8623, Test Loss: 0.4723, Test Accuracy: 0.8692
Epoch 5/5, Loss: 0.4220, Accuracy: 0.8803, Test Loss: 0.4497, Test Accuracy: 0.8726
Training GRU...
