# Verify Correctness of GPTQ-triton

This notebook verifies the correctness of the Triton kernels and the other modifications (QKV fusion and the fused MLP) by comparing their outputs against reference implementations.

In [1]:
import os
# Restrict this process to GPU 0. The assignment is placed before the torch / kernel imports
# below — presumably so it is in effect before anything initializes CUDA (TODO confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import itertools

import original_quant
import gptq_triton
import torch
import torch.nn as nn
from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaConfig
# NOTE(review): `gptq` is not referenced by any cell visible in this notebook — confirm it is still needed.
import gptq
from quantize import dumbquant, pack_linear

## Verify QuantLinear

In [2]:
# QuantLinear is compared against a reference and the CUDA kernel at various values of M, N and K
# The reference is an FP16 simulation of the quantized weights
# (NOTE(review): this relies on dumbquant leaving the quantized weights in `layer` — confirm in quantize.py)
torch.manual_seed(0)

# Largest absolute error tolerated for the Triton kernel before a row is flagged
MAX_ABS_ERROR = 0.004

print("groupsize |   M   |   N   |   K   | cuda - ref | triton - ref | triton - cuda |")

# M = B * seq_len, N = output dimension, K = input dimension
for (groupsize, M, N, K) in itertools.product([-1, 128], [1, 8, 100, 256, 2048], [4096, 11008], [4096, 11008]):
	layer = nn.Linear(K, N, bias=False)  # Llama doesn't use bias
	vec = torch.randn(1, M, K, device='cuda', dtype=torch.float16)

	scales, zeros = dumbquant(layer, 4, groupsize=groupsize)

	# Pack the same weights, scales and zeros into both kernels under test
	cudalayer = original_quant.QuantLinear(4, groupsize, layer.in_features, layer.out_features)
	cudalayer.pack(layer, scales.clone(), zeros.clone())

	tritonlayer = gptq_triton.QuantLinear(4, groupsize, layer.in_features, layer.out_features, bias=False)
	pack_linear(tritonlayer, layer.weight.data, scales, zeros, None)

	# The reference layer is cast to FP16 only after packing
	layer = layer.half().to('cuda')
	cudalayer = cudalayer.to('cuda')
	tritonlayer = tritonlayer.to('cuda')

	# Inference only: no need to record an autograd graph for the forward passes
	with torch.no_grad():
		ref = layer(vec)
		cuda_out = cudalayer(vec)
		triton_out = tritonlayer(vec)

	# Reduce each error once and reuse it for both printing and thresholding
	cuda_vs_ref = (cuda_out - ref).abs().max()
	triton_vs_ref = (triton_out - ref).abs().max()
	triton_vs_cuda = (triton_out - cuda_out).abs().max()

	# Print results
	print(f'    {groupsize:5d}', end=' | ')
	print(f'{M:5d}', end=' | ')
	print(f'{N:5d}', end=' | ')
	print(f'{K:5d}', end=' | ')
	print(f'  {cuda_vs_ref:.6f}', end=' | ')
	print(f'    {triton_vs_ref:.6f}', end=' | ')
	print(f'     {triton_vs_cuda:.6f}', end=' | ')

	# Only the Triton kernel is under test, so the cuda - ref column is informational
	if triton_vs_ref > MAX_ABS_ERROR or triton_vs_cuda > MAX_ABS_ERROR:
		print(" !!! WARNING: Error is too large !!! ")
	else:
		print()

groupsize |   M   |   N   |   K   | cuda - ref | triton - ref | triton - cuda |
       -1 |     1 |  4096 |  4096 |   0.000977 |     0.001953 |      0.001953 | 
       -1 |     1 |  4096 | 11008 |   0.000977 |     0.001953 |      0.001953 | 
       -1 |     1 | 11008 |  4096 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |     1 | 11008 | 11008 |   0.000977 |     0.001953 |      0.000977 | 
       -1 |     8 |  4096 |  4096 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |     8 |  4096 | 11008 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |     8 | 11008 |  4096 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |     8 | 11008 | 11008 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |   100 |  4096 |  4096 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |   100 |  4096 | 11008 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |   100 | 11008 |  4096 |   0.001953 |     0.001953 |      0.001953 | 
       -1 |   100 | 11008 | 1

## Verify QKV Fusion

In [3]:
# Comparison to ensure that the QKV fusion is correct
class TestModel(nn.Module):
	"""Minimal container module exposing a single LlamaAttention as `attn`.

	The attention block is built from a LlamaConfig with hidden_size=4096 and
	num_attention_heads=32.
	"""

	def __init__(self):
		super().__init__()
		attn_config = LlamaConfig(hidden_size=4096, num_attention_heads=32)
		self.attn = LlamaAttention(attn_config)

	def forward(self, x):
		"""Forward `x` through the wrapped attention module and return its result."""
		return self.attn(x)

model = TestModel()

# Quantize the model: every nn.Linear is swapped for a packed 4-bit gptq_triton.QuantLinear.
# Iterate over a snapshot because submodules are replaced inside the loop.
for name, m in list(model.named_modules()):
	if not isinstance(m, nn.Linear):
		continue

	scales, zeros = dumbquant(m, 4, groupsize=-1)
	triton_layer = gptq_triton.QuantLinear(4, -1, m.in_features, m.out_features, bias=False)
	pack_linear(triton_layer, m.weight.data, scales, zeros, None)

	# Replace in model. For a top-level child, rpartition yields an empty parent name,
	# which get_submodule('') resolves to the model itself.
	parent_name, _sep, child_name = name.rpartition('.')
	parent = model.get_submodule(parent_name)

	setattr(parent, child_name, triton_layer)

# Save the original attention layer
original_attn = model.attn

# Fuse
gptq_triton.make_quant_attn(model)
fused_attn = model.attn

# The fusion must actually have replaced the attention module; this does not depend on M,
# so it is checked once, before any comparison is made.
assert isinstance(fused_attn, gptq_triton.QuantLlamaAttention)

# Move to CUDA
original_attn.to('cuda')
fused_attn.to('cuda')

# Compare
for M in [1, 8, 100, 256, 2048]:
	x = torch.randn(1, M, 4096, device='cuda', dtype=torch.float16)
	# Positions 0..M-1 with a leading batch dimension: shape (1, M)
	position_ids = torch.arange(0, M, dtype=torch.long, device='cuda').unsqueeze(0)

	# Inference only: no need to record an autograd graph
	with torch.no_grad():
		original_out = original_attn(x, position_ids=position_ids)[0]
		fused_out = fused_attn(x, position_ids=position_ids)[0]

	diff = (original_out - fused_out).abs().max()
	print(f"Max diff: {diff}")

	# The fused and unfused attention outputs are expected to match exactly
	assert diff == 0

Max diff: 0.0
Max diff: 0.0
Max diff: 0.0
Max diff: 0.0
Max diff: 0.0


## Verify Fused MLP

In [4]:
def _quantize_linears(module, groupsize):
	"""Replace every nn.Linear inside `module` with a packed 4-bit gptq_triton.QuantLinear.

	Args:
		module: The nn.Module whose nn.Linear submodules are replaced in place.
		groupsize: Group size passed to both dumbquant and QuantLinear.
	"""
	# Iterate over a snapshot because submodules are replaced inside the loop
	for name, m in list(module.named_modules()):
		if not isinstance(m, nn.Linear):
			continue

		scales, zeros = dumbquant(m, 4, groupsize=groupsize)
		triton_layer = gptq_triton.QuantLinear(4, groupsize, m.in_features, m.out_features, bias=False)
		pack_linear(triton_layer, m.weight.data, scales, zeros, None)

		# For a direct child, rpartition yields an empty parent name,
		# which get_submodule('') resolves to `module` itself.
		parent_name, _sep, child_name = name.rpartition('.')
		setattr(module.get_submodule(parent_name), child_name, triton_layer)


# Two MLPs with identical FP16 weights: one per group size under test
layer = LlamaMLP(4096, 11008, 'silu')
layer = layer.half()
layer_g128 = LlamaMLP(4096, 11008, 'silu')
layer_g128 = layer_g128.half()
layer_g128.load_state_dict(layer.state_dict())

# Quantize
_quantize_linears(layer, groupsize=-1)
_quantize_linears(layer_g128, groupsize=128)

# Fuse
fused_layer = gptq_triton.make_fused_mlp(layer)
fused_layer_g128 = gptq_triton.make_fused_mlp(layer_g128)
assert isinstance(fused_layer, gptq_triton.QuantLlamaMLP) and isinstance(fused_layer_g128, gptq_triton.QuantLlamaMLP)

# Move to CUDA
layer.to('cuda')
layer_g128.to('cuda')
fused_layer.to('cuda')
fused_layer_g128.to('cuda')

# (unfused, fused) pairs, checked in this order for every M: groupsize=-1 first, then groupsize=128
mlp_pairs = [(layer, fused_layer), (layer_g128, fused_layer_g128)]

# Compare
for M in [1, 8, 100, 256, 2048]:
	# The same input is fed to both group sizes
	x = torch.randn(1, M, 4096, device='cuda', dtype=torch.float16)

	for unfused_mlp, fused_mlp in mlp_pairs:
		# Inference only: no need to record an autograd graph
		with torch.no_grad():
			original_out = unfused_mlp(x)
			fused_out = fused_mlp(x)

		diff = (original_out - fused_out).abs().max()
		print(f"Max diff: {diff}")

		# There is a small difference because the fused MLP performs some calculations in float32, while the original MLP performs them in float16
		assert diff < 1e-3

Max diff: 0.000244140625
Max diff: 0.000244140625
Max diff: 0.000244140625
Max diff: 0.000244140625
Max diff: 0.000244140625
Max diff: 0.000244140625
Max diff: 0.0003662109375
Max diff: 0.00025177001953125
Max diff: 0.00048828125
Max diff: 0.000274658203125
