Cast quantized linear layer output back to the input (model) dtype

chu-tianxiang · Aug 26, 2023 · b633191
1 parent 3f501ac
commit b633191
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/vllm/model_executor/quantize.py b/vllm/model_executor/quantize.py
@@ -69,7 +69,7 @@ def __init__(
                              outfeatures // world_size, bias, **kwargs)
 
         def forward(self, input_):
-            output_parallel = super().forward(input_)
+            output_parallel = super().forward(input_).to(input_.dtype)
             if self.gather_output:
                 # All-gather across the partitions.
                 output = gather_from_tensor_model_parallel_region(output_parallel)
@@ -100,7 +100,7 @@ def forward(self, input_):
                 input_parallel = input_
             else:
                 input_parallel = scatter_to_tensor_model_parallel_region(input_)
-            output_parallel = super().forward(input_parallel)
+            output_parallel = super().forward(input_parallel).to(input_.dtype)
             if self.reduce_results and self.world_size > 1:
                 output = reduce_from_tensor_model_parallel_region(output_parallel)
             else:
@@ -126,7 +126,7 @@ def forward(self, input_):
             # All-gather across the partitions.
             if self.input_is_parallel:
                 input_ = gather_from_tensor_model_parallel_region(input_)
-            output = super().forward(input_)
+            output = super().forward(input_).to(input_.dtype)
             return output, None
 
     if isinstance(module, QuantLinear):