diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py
index 7458024840..7786c98172 100644
--- a/scripts/question_answering/models.py
+++ b/scripts/question_answering/models.py
@@ -58,13 +58,67 @@ def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask):
             contextual_embeddings = self.backbone(tokens, token_types, valid_length)
         else:
             contextual_embeddings = self.backbone(tokens, valid_length)
-        scores = self.qa_outputs(contextual_embedding)
+        scores = self.qa_outputs(contextual_embeddings)
         start_scores = scores[:, :, 0]
         end_scores = scores[:, :, 1]
         start_logits = masked_logsoftmax(F, start_scores, mask=p_mask, axis=-1)
         end_logits = masked_logsoftmax(F, end_scores, mask=p_mask, axis=-1)
         return start_logits, end_logits
 
+    def inference(self, tokens, token_types, valid_length, p_mask,
+                  start_top_n: int = 5, end_top_n: int = 5):
+        """Get the top-n start and end position candidates used for inference
+
+        Parameters
+        ----------
+        tokens
+            The input tokens. Shape (batch_size, sequence_length)
+        token_types
+            The input token types. Shape (batch_size, sequence_length)
+        valid_length
+            The valid length of the tokens. Shape (batch_size,)
+        p_mask
+            The mask which indicates that some tokens won't be used in the calculation.
+            Shape (batch_size, sequence_length)
+        start_top_n
+            The number of candidates to select for the start position.
+        end_top_n
+            The number of candidates to select for the end position.
+
+        Returns
+        -------
+        start_top_logits
+            The top start logits, taken from the output of masked_logsoftmax.
+            Shape (batch_size, start_top_n)
+        start_top_index
+            Index of the top start logits along the last (sequence) axis.
+            Shape (batch_size, start_top_n)
+        end_top_logits
+            The top end logits, taken from the output of masked_logsoftmax.
+            Shape (batch_size, end_top_n)
+        end_top_index
+            Index of the top end logits along the last (sequence) axis.
+            Shape (batch_size, end_top_n)
+        """
+        # contextual_embeddings: Shape (batch_size, sequence_length, C)
+        if self.use_segmentation:
+            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+        else:
+            contextual_embeddings = self.backbone(tokens, valid_length)
+        scores = self.qa_outputs(contextual_embeddings)
+        start_scores = scores[:, :, 0]
+        end_scores = scores[:, :, 1]
+        start_logits = masked_logsoftmax(mx.nd, start_scores, mask=p_mask, axis=-1)
+        end_logits = masked_logsoftmax(mx.nd, end_scores, mask=p_mask, axis=-1)
+        # The shape of start_top_index will be (..., start_top_n)
+        start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
+                                                        ret_typ='both')
+        # The end candidates are picked from end_logits alone, not once per start candidate,
+        # so end_top_logits and end_top_index have shape (..., end_top_n).
+        end_top_logits, end_top_index = mx.npx.topk(end_logits, k=end_top_n, axis=-1,
+                                                    ret_typ='both')
+        return start_top_logits, start_top_index, end_top_logits, end_top_index
+
 
 @use_np
 class ModelForQAConditionalV1(HybridBlock):