<a href="https://colab.research.google.com/github/foxtrotmike/CS909/blob/master/brevitas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install brevitas

Collecting brevitas
  Downloading brevitas-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Collecting dependencies==2.0.1 (from brevitas)
  Downloading dependencies-2.0.1-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting setuptools<70.0 (from brevitas)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting unfoldNd (from brevitas)
  Downloading unfoldNd-0.2.3-py3-none-any.whl.metadata (1.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.1->brevitas)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.1->brevitas)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9.1->brevitas)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9.1->brevita

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import brevitas.nn as qnn
from brevitas.quant import Int32Bias

# XOR truth table with the two inputs encoded as -1/+1.
X = torch.tensor(
    [[a, b] for a in (-1, 1) for b in (-1, 1)],
    dtype=torch.float32,
)
# Class label is 1 exactly when the two inputs differ, otherwise 0.
Y = (X[:, 0] != X[:, 1]).to(torch.long)

# All four samples are served as one shuffled mini-batch.
dataset = TensorDataset(X, Y)
train_loader = DataLoader(dataset, shuffle=True, batch_size=4)

# Standard MLP Model (32-bit precision)
class StandardXORNet(nn.Module):
    """Full-precision 2-10-10-2 MLP that outputs log-probabilities over two classes."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order so parameter registration and
        # random initialisation follow the data flow.
        self.fc1 = nn.Linear(in_features=2, out_features=10, bias=True)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=10, out_features=10, bias=True)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=10, out_features=2, bias=True)
        # Log-softmax over dim 1 (the class dimension of a batched input).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-softmax class scores for a batch of 2-feature inputs."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)

# Quantized MLP Model using Brevitas (2-bit precision)
class QuantizedXORNet(nn.Module):
    """2-10-10-2 MLP assembled from Brevitas layers configured with 2-bit widths."""

    def __init__(self):
        super().__init__()

        # Keyword settings shared by every activation quantizer in the network.
        act_cfg = dict(bit_width=2, return_quant_tensor=True)
        # Keyword settings shared by every quantized linear layer.
        linear_cfg = dict(bias=True, weight_bit_width=2, bias_quant=Int32Bias)

        # Quantize the raw input before it reaches the first linear layer.
        self.quant_inp = qnn.QuantIdentity(**act_cfg)

        self.fc1 = qnn.QuantLinear(2, 10, **linear_cfg)
        self.relu1 = qnn.QuantReLU(**act_cfg)

        self.fc2 = qnn.QuantLinear(10, 10, **linear_cfg)
        self.relu2 = qnn.QuantReLU(**act_cfg)

        self.fc3 = qnn.QuantLinear(10, 2, **linear_cfg)
        # Plain PyTorch log-softmax over dim 1 produces the final scores.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Run the quantized stack and return log-softmax class scores."""
        quantized = self.quant_inp(x)
        hidden = self.relu1(self.fc1(quantized))
        hidden = self.relu2(self.fc2(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)

# Function to calculate memory usage
def calculate_model_memory_usage(model: nn.Module, input_size=(1, 2), bit_width=32, quantized=False):
    """Estimate the memory footprint of ``model`` at a nominal bit width.

    This is an arithmetic estimate, not a measurement: every parameter and
    every recorded activation element is charged ``bit_width`` bits,
    regardless of the dtype the tensors are actually stored in.

    Args:
        model: Network to analyse.
        input_size: Shape of the random dummy input used for the single
            forward pass that records activation sizes.
        bit_width: Bits charged per parameter / activation element.
        quantized: Unused; accepted only so existing callers keep working.

    Returns:
        dict with byte counts under ``'parameters'``, ``'activations'`` and
        ``'optimizer_states'``, plus ``'total_memory_bytes'``,
        ``'total_memory_kb'`` and ``'total_memory_mb'``.

    The model is temporarily switched to eval mode for the dummy forward
    pass; its previous train/eval mode is restored afterwards and the
    temporary forward hooks are always removed, even if the pass raises.
    """
    memory_summary = {'parameters': 0, 'activations': 0, 'optimizer_states': 0}

    for param in model.parameters():
        memory_summary['parameters'] += param.numel() * bit_width / 8

    def activation_hook(module, input, output):
        # Only plain tensor outputs are counted; other output types are skipped.
        if isinstance(output, torch.Tensor):
            memory_summary['activations'] += output.numel() * bit_width / 8

    # Create the dummy input on the model's device so the forward pass also
    # works for models that have been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    hooks = []
    model.eval()
    try:
        with torch.no_grad():
            dummy_input = torch.randn(input_size, device=device)

            for module in model.modules():
                if isinstance(module, (nn.ReLU, nn.Linear, qnn.QuantReLU, qnn.QuantLinear)):
                    hooks.append(module.register_forward_hook(activation_hook))

            model(dummy_input)
    finally:
        # Never leave measurement hooks (or a changed mode) behind on the model.
        for hook in hooks:
            hook.remove()
        model.train(was_training)

    # Two 32-bit values are charged per trainable element (e.g. the two
    # moment estimates an Adam-style optimizer keeps), independent of bit_width.
    for param in model.parameters():
        if param.requires_grad:
            memory_summary['optimizer_states'] += param.numel() * 2 * 32 / 8

    total_memory = sum(memory_summary.values())
    memory_summary['total_memory_bytes'] = total_memory
    memory_summary['total_memory_kb'] = total_memory / 1024
    memory_summary['total_memory_mb'] = total_memory / (1024 ** 2)

    return memory_summary

# Initialize both models
# (construction order matters for reproducibility: each constructor draws its
# initial weights from the global RNG)
standard_model = StandardXORNet()
quantized_model = QuantizedXORNet()

# Optimizers and Loss Function
# NLLLoss is used because both networks end in LogSoftmax and therefore
# already output log-probabilities.
criterion = nn.NLLLoss()
# One independent Adam optimizer per model, same learning rate for both.
standard_optimizer = optim.Adam(standard_model.parameters(), lr=0.01)
quantized_optimizer = optim.Adam(quantized_model.parameters(), lr=0.01)

# Predictions before training
def show_predictions(model, name, inputs, labels):
    """Print predicted classes, true labels and accuracy for ``model``.

    The model is put into eval mode (and left there) and evaluated once on
    ``inputs`` with gradient tracking disabled. Nothing is returned.
    """
    model.eval()
    with torch.no_grad():
        scores = model(inputs)
        # The predicted class is the index of the highest score along dim 1.
        predicted = torch.argmax(scores, dim=1)

        print(f"\n{name} Predictions:")
        print("Input:\n", inputs.numpy())
        print("Predicted:\n", predicted.numpy())
        print("Actual:\n", labels.numpy())

        # Fraction of matching labels, reported as a percentage.
        hit_rate = predicted.eq(labels).float().mean().item()
        accuracy = hit_rate * 100
        print(f"Accuracy: {accuracy:.2f}%\n")

# Baseline: evaluate both freshly initialised models before any training.
show_predictions(standard_model, "Standard Model (Before Training)", X, Y)
show_predictions(quantized_model, "Quantized Model (Before Training)", X, Y)

# Training Loop
# Both models are trained side by side: in every epoch each model sees one
# full pass over the same loader and is updated by its own optimizer.
epochs = 500
for epoch in range(epochs):
    for model, optimizer, loader in [(standard_model, standard_optimizer, train_loader),
                                     (quantized_model, quantized_optimizer, train_loader)]:
        # show_predictions left the models in eval mode, so switch back.
        model.train()
        for data, target in loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

# Predictions after training
show_predictions(standard_model, "Standard Model (After Training)", X, Y)
show_predictions(quantized_model, "Quantized Model (After Training)", X, Y)

# Memory Usage Comparison
# These are nominal estimates: the bit_width argument decides how many bits
# are charged per element (32 for the standard model, 2 for the quantized one).
standard_memory_usage = calculate_model_memory_usage(standard_model, bit_width=32, quantized=False)
quantized_memory_usage = calculate_model_memory_usage(quantized_model, bit_width=2, quantized=True)

print("\nMemory Usage Comparison:")
print("\nStandard Model (32-bit):")
# Every entry in the summary dict is numeric, so one format fits all keys.
for key, value in standard_memory_usage.items():
    print(f"{key}: {value:.2f}")

print("\nQuantized Model (2-bit, Brevitas):")
for key, value in quantized_memory_usage.items():
    print(f"{key}: {value:.2f}")



Standard Model (Before Training) Predictions:
Input:
 [[-1. -1.]
 [-1.  1.]
 [ 1. -1.]
 [ 1.  1.]]
Predicted:
 [0 0 0 0]
Actual:
 [0 1 1 0]
Accuracy: 50.00%


Quantized Model (Before Training) Predictions:
Input:
 [[-1. -1.]
 [-1.  1.]
 [ 1. -1.]
 [ 1.  1.]]
Predicted:
 [1 1 1 1]
Actual:
 [0 1 1 0]
Accuracy: 50.00%


Standard Model (After Training) Predictions:
Input:
 [[-1. -1.]
 [-1.  1.]
 [ 1. -1.]
 [ 1.  1.]]
Predicted:
 [0 1 1 0]
Actual:
 [0 1 1 0]
Accuracy: 100.00%


Quantized Model (After Training) Predictions:
Input:
 [[-1. -1.]
 [-1.  1.]
 [ 1. -1.]
 [ 1.  1.]]
Predicted:
 [0 1 1 0]
Actual:
 [0 1 1 0]
Accuracy: 100.00%


Memory Usage Comparison:

Standard Model (32-bit):
parameters: 648.00
activations: 168.00
optimizer_states: 1296.00
total_memory_bytes: 2112.00
total_memory_kb: 2.06
total_memory_mb: 0.00

Quantized Model (2-bit, Brevitas):
parameters: 41.25
activations: 10.50
optimizer_states: 1320.00
total_memory_bytes: 1371.75
total_memory_kb: 1.34
total_memory_mb: 0.00


In [None]:
import torch
import torch.nn as nn
import brevitas.nn as qnn
from brevitas.quant import Int32Bias

# Standard MLP Model (32-bit precision)
class StandardXORNet(nn.Module):
    """Full-precision 2-10-10-2 MLP that outputs log-probabilities over two classes."""

    def __init__(self):
        super().__init__()
        # Layers are created in forward order so parameter registration and
        # random initialisation follow the data flow.
        self.fc1 = nn.Linear(in_features=2, out_features=10, bias=True)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=10, out_features=10, bias=True)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(in_features=10, out_features=2, bias=True)
        # Log-softmax over dim 1 (the class dimension of a batched input).
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-softmax class scores for a batch of 2-feature inputs."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)

# Quantized MLP Model using Brevitas (2-bit precision)
class QuantizedXORNet(nn.Module):
    """2-10-10-2 MLP assembled from Brevitas layers configured with 2-bit widths."""

    def __init__(self):
        super().__init__()

        # Keyword settings shared by every activation quantizer in the network.
        act_cfg = dict(bit_width=2, return_quant_tensor=True)
        # Keyword settings shared by every quantized linear layer.
        linear_cfg = dict(bias=True, weight_bit_width=2, bias_quant=Int32Bias)

        # Quantize the raw input before it reaches the first linear layer.
        self.quant_inp = qnn.QuantIdentity(**act_cfg)

        self.fc1 = qnn.QuantLinear(2, 10, **linear_cfg)
        self.relu1 = qnn.QuantReLU(**act_cfg)

        self.fc2 = qnn.QuantLinear(10, 10, **linear_cfg)
        self.relu2 = qnn.QuantReLU(**act_cfg)

        self.fc3 = qnn.QuantLinear(10, 2, **linear_cfg)
        # Plain PyTorch log-softmax over dim 1 produces the final scores.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Run the quantized stack and return log-softmax class scores."""
        quantized = self.quant_inp(x)
        hidden = self.relu1(self.fc1(quantized))
        hidden = self.relu2(self.fc2(hidden))
        logits = self.fc3(hidden)
        return self.softmax(logits)

# Function to measure actual memory usage
def measure_actual_memory_usage(model, model_name):
    """Print a per-tensor storage breakdown for ``model`` and return the total in bytes.

    Each registered parameter and buffer is charged ``numel() * element_size()``
    bytes, i.e. the size of the tensor as it is actually stored.
    """
    total_memory = 0
    print(f"\nActual Memory Usage for {model_name}:")

    # Parameters are listed first, then buffers; buffer lines carry a suffix.
    for accessor, suffix in (("named_parameters", ""), ("named_buffers", " (buffer)")):
        for name, tensor in getattr(model, accessor)():
            count = tensor.numel()
            width = tensor.element_size()
            nbytes = count * width
            total_memory += nbytes
            print(f"  {name}{suffix}: {count} elements * {width} bytes = {nbytes:.2f} bytes")

    size_kb = total_memory / 1024
    size_mb = total_memory / (1024 ** 2)
    print(f"Total Model Memory Usage: {size_kb:.2f} KB ({size_mb:.4f} MB)")
    return total_memory

# Initialize models
# (fresh, untrained instances: stored size does not depend on training)
standard_model = StandardXORNet()
quantized_model = QuantizedXORNet()

# Measure memory usage of the models
# Each call prints a per-tensor breakdown and returns the total in bytes.
standard_memory = measure_actual_memory_usage(standard_model, "Standard Model (32-bit)")
quantized_memory = measure_actual_memory_usage(quantized_model, "Quantized Model (2-bit)")

# Comparison of total memory usage
print("\nMemory Usage Comparison:")
print(f"Standard Model: {standard_memory / 1024:.2f} KB")
print(f"Quantized Model: {quantized_memory / 1024:.2f} KB")
# Relative saving versus the standard model; the value is negative when the
# quantized model's stored tensors are larger than the standard model's.
print(f"Memory Reduction: {(1 - quantized_memory / standard_memory) * 100:.2f}%")



Actual Memory Usage for Standard Model (32-bit):
  fc1.weight: 20 elements * 4 bytes = 80.00 bytes
  fc1.bias: 10 elements * 4 bytes = 40.00 bytes
  fc2.weight: 100 elements * 4 bytes = 400.00 bytes
  fc2.bias: 10 elements * 4 bytes = 40.00 bytes
  fc3.weight: 20 elements * 4 bytes = 80.00 bytes
  fc3.bias: 2 elements * 4 bytes = 8.00 bytes
Total Model Memory Usage: 0.63 KB (0.0006 MB)

Actual Memory Usage for Quantized Model (2-bit):
  quant_inp.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value: 1 elements * 4 bytes = 4.00 bytes
  fc1.weight: 20 elements * 4 bytes = 80.00 bytes
  fc1.bias: 10 elements * 4 bytes = 40.00 bytes
  relu1.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value: 1 elements * 4 bytes = 4.00 bytes
  fc2.weight: 100 elements * 4 bytes = 400.00 bytes
  fc2.bias: 10 elements * 4 bytes = 40.00 bytes
  relu2.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value: 1 elements * 4 bytes = 4.00 bytes
  fc3.weight: 20

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.quantization import (
    QuantStub, DeQuantStub, prepare_qat, convert, get_default_qat_qconfig
)

# XOR truth table with the two inputs encoded as -1/+1.
X = torch.tensor(
    [[a, b] for a in (-1, 1) for b in (-1, 1)],
    dtype=torch.float32,
)
# Class label is 1 exactly when the two inputs differ, otherwise 0.
Y = (X[:, 0] != X[:, 1]).to(torch.long)

# All four samples are served as one shuffled mini-batch.
dataset = TensorDataset(X, Y)
train_loader = DataLoader(dataset, shuffle=True, batch_size=4)

# Simple MLP Model with Quantization Support
class QuantizedXORNet(nn.Module):
    """2-10-10-2 MLP wired for PyTorch eager-mode quantization-aware training.

    The linear/ReLU stack sits between a ``QuantStub`` and a ``DeQuantStub``.
    The final ``LogSoftmax`` is applied *after* dequantization: there is no
    quantized-CPU kernel for ``log_softmax``, so running it on the quantized
    tensor makes the converted model fail with ``NotImplementedError``.
    """

    def __init__(self):
        super(QuantizedXORNet, self).__init__()

        # Define a simple fully connected network
        self.fc1 = nn.Linear(2, 10)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(10, 10)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(10, 2)
        self.softmax = nn.LogSoftmax(dim=1)

        # Quantization stubs marking where tensors enter and leave the
        # quantized region of the network.
        self.quant = QuantStub()
        self.dequant = DeQuantStub()

    def forward(self, x):
        """Return float log-softmax class scores for a batch of 2-feature inputs."""
        # Enter the quantized region.
        x = self.quant(x)

        # Quantizable part of the network.
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)

        # Leave the quantized region before the float-only log-softmax.
        x = self.dequant(x)
        return self.softmax(x)

# Initialize model, loss function, and optimizer
model = QuantizedXORNet()
# NLLLoss matches the network's LogSoftmax output (log-probabilities).
criterion = nn.NLLLoss()
# NOTE(review): the optimizer is created before prepare_qat() rewrites the
# model in place; confirm it still references the parameters being trained.
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Prepare the model for Quantization-Aware Training (QAT)
model.qconfig = get_default_qat_qconfig('fbgemm')  # 'fbgemm' is recommended for x86 CPUs
prepare_qat(model, inplace=True)

# Training Loop
epochs = 500
for epoch in range(epochs):
    model.train()
    # The loader yields the whole 4-sample dataset as one batch (batch_size=4).
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

# Convert the model to a quantized version
# The model is put in eval mode first; inplace=False keeps the QAT model
# itself untouched and returns a separate converted copy.
model.eval()
model_int8 = convert(model.eval(), inplace=False)

# Function to show predictions
def show_predictions(model, model_name, inputs, labels):
    """Print predicted classes, true labels and accuracy for ``model``.

    The model is put into eval mode (and left there) and evaluated once on
    ``inputs`` with gradient tracking disabled. Nothing is returned.
    """
    model.eval()
    with torch.no_grad():
        scores = model(inputs)
        # The predicted class is the index of the highest score along dim 1.
        predicted = torch.argmax(scores, dim=1)
        # Fraction of matching labels, reported as a percentage.
        hit_rate = predicted.eq(labels).float().mean().item()
        accuracy = hit_rate * 100

        print(f"\n{model_name} Predictions:")
        print("Input:\n", inputs.numpy())
        print("Predicted:\n", predicted.numpy())
        print("Actual:\n", labels.numpy())
        print(f"Accuracy: {accuracy:.2f}%\n")

# Show predictions before and after quantization
# `model` is the trained QAT model; `model_int8` is the copy returned by convert().
show_predictions(model, "Model Before Quantization", X, Y)
show_predictions(model_int8, "Model After Quantization", X, Y)

# Measure memory usage of the quantized model
def measure_model_memory(model, model_name):
    """Print a per-tensor storage breakdown for ``model`` and return the total in bytes.

    Each registered parameter and buffer is charged ``numel() * element_size()``
    bytes. Returning the byte total lets callers compare models numerically
    instead of reading the printed report.

    NOTE(review): only tensors exposed through ``named_parameters()`` and
    ``named_buffers()`` are counted. Converted quantized modules may keep
    their weights in a packed form that is not exposed this way, so the
    reported total for a converted model may be too low; verify before
    relying on it.
    """
    total_memory = 0
    print(f"\nMemory Usage for {model_name}:")

    for name, param in model.named_parameters():
        param_memory = param.numel() * param.element_size()
        total_memory += param_memory
        print(f"  {name}: {param.numel()} elements * {param.element_size()} bytes = {param_memory:.2f} bytes")

    for name, buffer in model.named_buffers():
        buffer_memory = buffer.numel() * buffer.element_size()
        total_memory += buffer_memory
        print(f"  {name} (buffer): {buffer.numel()} elements * {buffer.element_size()} bytes = {buffer_memory:.2f} bytes")

    print(f"Total Model Memory Usage: {total_memory / 1024:.2f} KB ({total_memory / (1024 ** 2):.4f} MB)")
    return total_memory

# Measure memory usage before and after quantization
# Each call prints a per-tensor breakdown followed by the total.
measure_model_memory(model, "Model Before Quantization")
measure_model_memory(model_int8, "Model After Quantization")





Model Before Quantization Predictions:
Input:
 [[-1. -1.]
 [-1.  1.]
 [ 1. -1.]
 [ 1.  1.]]
Predicted:
 [0 1 1 0]
Actual:
 [0 1 1 0]
Accuracy: 100.00%



NotImplementedError: Could not run 'aten::_log_softmax.out' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::_log_softmax.out' is only available for these backends: [CPU, CUDA, Meta, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradHIP, AutogradXLA, AutogradMPS, AutogradIPU, AutogradXPU, AutogradHPU, AutogradVE, AutogradLazy, AutogradMTIA, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, AutogradMeta, AutogradNestedTensor, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

CPU: registered at aten/src/ATen/RegisterCPU.cpp:30476 [kernel]
CUDA: registered at aten/src/ATen/RegisterCUDA.cpp:44679 [kernel]
Meta: registered at /dev/null:241 [kernel]
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:153 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:497 [backend fallback]
Functionalize: registered at aten/src/ATen/RegisterFunctionalization_3.cpp:26243 [kernel]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at ../aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: registered at ../torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:5390 [kernel]
AutogradOther: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradCPU: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradCUDA: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradHIP: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradXLA: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradMPS: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradIPU: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradXPU: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradHPU: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradVE: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradLazy: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradMTIA: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradPrivateUse1: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradPrivateUse2: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradPrivateUse3: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradMeta: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
AutogradNestedTensor: registered at ../torch/csrc/autograd/generated/VariableType_0.cpp:17993 [autograd kernel]
Tracer: registered at ../torch/csrc/autograd/generated/TraceType_0.cpp:17004 [kernel]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:321 [backend fallback]
AutocastXPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:463 [backend fallback]
AutocastMPS: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at ../aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at ../aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at ../aten/src/ATen/functorch/TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:161 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:493 [backend fallback]
PreDispatch: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:165 [backend fallback]
PythonDispatcher: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:157 [backend fallback]
