# Evaluation of Vanilla RAG Chain without Context Enrichment through auxillary model

In [1]:
from dotenv import load_dotenv
import os
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-lite', google_api_key=os.environ["GOOGLE_API_KEY"])

In [1]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('sample.pdf')
docs = loader.load()

In [3]:
text = ''

for i in range(len(docs)):
    text += docs[i].page_content

text

'Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still suffers\nfrom inefficient, overly lengthy outputs. We introduce Speculative Think-\ning1, a training-free framework that enables large reasoning models to\nguide smaller ones during inference at the reasoning level, distinct from\nspeculative decoding, which operates at the token level. Our approach\nis based on two observations: (1) reasoning-supportive tokens such as\n“wait” frequently appear after structural delimiters like “\\n\\n”, serving as\nsignals for reflection or continuation; and (2) larger models exhibit stronger\ncontrol over refl

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)

NameError: name 'text' is not defined

In [8]:
from langchain_core.documents import Document

docs = [Document(page_content=item) for item in chunks]

In [11]:
len(docs)

69

In [12]:
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [16]:
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001', google_api_key = os.environ['GOOGLE_API_KEY'])
vector_store = FAISS.from_documents(docs, embeddings)

In [20]:
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':4})

In [18]:
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [17]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(template="""
You are a helpful assistant.
Answer ONLY from the provided transcript context.
If the context is insufficient, just say you don't know.
Context: {context}
Question: {question}
                        """, input_variables=['context', 'question'])

In [77]:
def format_docs(retrieved_docs):
    # Handle both list[Document] and single Document
    if not isinstance(retrieved_docs, list):
        retrieved_docs = [retrieved_docs]

    return "\n\n".join(doc.page_content for doc in retrieved_docs)

parallel_chain = RunnableParallel(
    {
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough()
    }
)
parser = StrOutputParser()
normal_chain = parallel_chain | prompt | model | parser
main_chain = parallel_chain | normal_chain

In [None]:
from langchain_core.runnables import RunnableLambda

def format_docs(retrieved_docs):
    return "\n\n".join(doc.page_content for doc in retrieved_docs)

# parallel_chain produces {"context": str, "question": str}
parallel_chain = RunnableParallel({
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough()
})

parser = StrOutputParser()
normal_chain = prompt | model | parser   # expects {"context", "question"} and gives string

# Wrap to include both answer + raw retrieved docs


def with_docs(inputs):
    # raw retrieval
    raw_docs = retriever.invoke(inputs["question"])

    # force convert list[Document] -> string
    formatted_context = "\n\n".join([doc.page_content for doc in raw_docs])

    # run prompt + model
    answer = normal_chain.invoke({
        "context": formatted_context,
        "question": inputs["question"]
    })

    # return both answer + contexts (list of str for Ragas)
    return {
        "answer": answer,
        "contexts": [doc.page_content for doc in raw_docs]
    }

main_chain = RunnableLambda(with_docs)


In [82]:
main_chain.invoke({"question": "What is the document about?"})

{'answer': 'The document is about comparing Decode and Prefix stages, Speculative Thinking, and Speculative Decoding.',
 'contexts': ['�3\n�1, �1\n�2, �2\n�3, �3\n�4\nDecode\nPrefix\n   kv \ncache\n(a) decode v.s. prefix\n0 50 100 150 200 250\nGenerated T oken Num\n0\n2\n4\n6\n8Time (seconds)\ndecode\nprefix (b) Deepseek-1.5B\n0 50 100 150 200 250\nGenerated T oken Num\n0\n5\n10\n15\n20Time (seconds)\ndecode\nprefix (c) Deepseek-32B\nFigure 7: Comparison between Decode and Prefix stages: average time consumed by the\n1.5B and 32B models when generating different numbers of output tokens. As the number\nincreases, decoding time grows significantly, while prefix time remains nearly constant.\nA.2 Hyperparameters of Speculative Thinking\nA sentence is labeled Affirmation or Reflection if it contains affirmation cues (e.g., yes, yep)\nor backtracking cues (e.g., wait, alternatively); and Statement if neither type is present. If\nboth Affirmation and Reflection keywords appear, the decision

## Now making the eval dataset

In [32]:
docs

[Document(metadata={}, page_content='Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still suffers\nfrom inefficient, overly lengthy outputs. We introduce Speculative Think-\ning1, a training-free framework that enables large reasoning models to\nguide smaller ones during inference at the reasoning level, distinct from\nspeculative decoding, which operates at the token level. Our approach\nis based on two observations: (1) reasoning-supportive tokens such as\n“wait” frequently appear after structural delimiters like “\\n\\n”, serving as\nsignals for reflection or continuation; and (2) larger models

In [61]:
json_string1 = """[
  {
    "question": [
      "Who are the authors of the paper 'Speculative Thinking: Enhancing Small-Model Reasoning with Large Model Guidance at Inference Time'?",
      "Which institutions are the authors affiliated with?",
      "What key limitation of existing post-training approaches does this paper aim to overcome?"
    ],
    "ground_truth": [
      "Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han",
      "Case Western Reserve University and Carnegie Mellon University",
      "They require costly training pipelines and still produce inefficient, overly lengthy outputs."
    ]
  },
  {
    "question": [
      "By how much did the 1.5B model’s accuracy improve on MATH500 when assisted by the 32B reasoning model?",
      "What was the reduction in average output length for the 1.5B model on MATH500 with assistance?",
      "What accuracy improvement did the framework provide for Qwen-2.5-7B-Instruct on MATH500?"
    ],
    "ground_truth": [
      "6.2%, from 83.2% to 89.4%",
      "15.7%, from 5439 tokens to 4583 tokens",
      "7.8%, from 74.0% to 81.8%"
    ]
  },
  {
    "question": [
      "Which four datasets were used to evaluate speculative thinking in Figure 1?",
      "What does '1.5B+32B' represent in the evaluation?",
      "Where is the official code for speculative thinking available?"
    ],
    "ground_truth": [
      "AIME 2020–2024, MATH500, GPQA, and AMC23",
      "The 32B model supervises reflective reasoning steps of the 1.5B model during inference.",
      "https://github.com/uservan/speculative_thinking"
    ]
  },
  {
    "question": [
      "Why are smaller language models widely used in real-world applications?",
      "What challenges do smaller models face on reasoning tasks?",
      "What kind of post-training is often applied to improve smaller models?"
    ],
    "ground_truth": [
      "Because of their lower computational and memory requirements.",
      "They often underperform on tasks requiring complex reasoning.",
      "Supervised fine-tuning on reasoning traces or reinforcement learning with verifiable signals."
    ]
  },
  {
    "question": [
      "What motivates the question of whether small reasoning models can be improved during inference without additional training?",
      "What is speculative decoding?",
      "Why are larger models impractical for many deployment scenarios?"
    ],
    "ground_truth": [
      "Small models show limited improvements with training-free inference methods, while larger models are much stronger but costly.",
      "An approach where a small model proposes tokens and a larger model verifies them to accelerate generation.",
      "Because their inference cost and latency are too high."
    ]
  },
  {
    "question": [
      "What is the main difference between speculative thinking and speculative decoding?",
      "Which structural cues are used to identify challenging reasoning segments?",
      "How do larger models compare to smaller ones in handling reflective reasoning segments?"
    ],
    "ground_truth": [
      "Speculative thinking operates at the reasoning level, while speculative decoding works at the token level.",
      "Paragraph breaks followed by reflective phrases like 'wait' or 'alternatively'.",
      "Larger models are more concise and effective at backtracking."
    ]
  },
  {
    "question": [
      "What accuracy improvements did a 1.5B model achieve with speculative thinking on AIME, MATH500, GPQA, and AMC23?",
      "How did speculative thinking affect non-reasoning models like Qwen-2.5-7B-Instruct?",
      "What is the overall paradigm shift that speculative thinking introduces?"
    ],
    "ground_truth": [
      "+6.6% on AIME, +6.2% on MATH500, +8.1% on GPQA, and +5.0% on AMC23",
      "It gained +7.8% on MATH500 and +14.2% on GPQA with large model assistance.",
      "A new inference-time paradigm fusing small-model efficiency with large-model reasoning strength."
    ]
  },
  {
    "question": [
      "What role does '\\n\\n' play in model reasoning processes?",
      "Which reasoning-supportive tokens often appear after '\\n\\n'?",
      "What dataset was used to analyze the distribution of preceding tokens for reasoning-supportive words?"
    ],
    "ground_truth": [
      "It acts as a structural clue, often triggering reflective or continuation behavior.",
      "Tokens such as 'wait', 'hmm', and 'alternatively'.",
      "MATH500 dataset."
    ]
  },
  {
    "question": [
      "According to Table 1, which symbol most frequently precedes reasoning-supportive tokens?",
      "What proportion of 'wait' tokens appear after '\\n\\n'?",
      "What does this suggest about the role of '\\n\\n'?"
    ],
    "ground_truth": [
      "The newline symbol '\\n\\n'.",
      "Over 80%.",
      "It acts as a thinking cue prompting reflection or continuation."
    ]
  },
  {
    "question": [
      "What are the three segment types identified in case analysis using '\\n\\n'?",
      "What does a reflection segment indicate?",
      "What is suggested by the first sentence after each '\\n\\n'?"
    ],
    "ground_truth": [
      "Affirmation, Reflection, and Statement.",
      "That the model intends to reflect on its previous thought.",
      "That it often contains reasoning-related cues."
    ]
  },
  {
    "question": [
      "Which models were compared in Section 2.2?",
      "What dataset was used for their comparison?",
      "What two performance metrics were analyzed?"
    ],
    "ground_truth": [
      "Deepseek-distilled Qwen-2.5-32B, 7B, and 1.5B.",
      "AIME 2022–2024 dataset.",
      "Accuracy and output length."
    ]
  },
  {
    "question": [
      "What general trend was observed between model size, accuracy, and output length?",
      "Why are incorrect responses typically longer than correct ones?",
      "What role do reflective phrases play in incorrect responses of smaller models?"
    ],
    "ground_truth": [
      "Larger models show higher accuracy and shorter outputs, while smaller models are less accurate and longer.",
      "Because they contain excessive self-reflection and redundant reasoning.",
      "They appear more frequently, signaling hesitation and ineffective backtracking."
    ]
  },
  {
    "question": [
      "What pattern do small models overuse when generating incorrect answers?",
      "How does speculative thinking propose to use '\\n\\n'?",
      "Why can larger models provide more accurate reasoning at reflective points?"
    ],
    "ground_truth": [
      "They overuse words like 'wait', indicating excessive self-reflection.",
      "As a control point to delegate reasoning segments to larger models.",
      "Because they are better at concise reasoning and avoiding redundant backtracking."
    ]
  },
  {
    "question": [
      "In speculative thinking, what roles do the speculative and target models play?",
      "What triggers an Affirmation/Reflection takeover?",
      "What happens when the speculative model generates an affirmation or reflection sentence after a delimiter?"
    ],
    "ground_truth": [
      "The small model is the speculative model, and the large model is the target model providing supervision.",
      "When a delimiter '\\n\\n' is followed by an affirmation or reflection sentence.",
      "The target model takes over and generates the next n1 tokens."
    ]
  },
  {
    "question": [
      "What kind of takeover is triggered by verification-related cues?",
      "What does the negativity counter 'c' track?",
      "What auxiliary mechanism is used to prevent excessive reflection loops?"
    ],
    "ground_truth": [
      "Verification takeover.",
      "The number of reflection sentences generated after '\\n\\n'.",
      "Inserting an auxiliary sentence and delegating the next n3 tokens to the target model."
    ]
  },
  {
    "question": [
      "What does the 'modify ratio' indicate in Table 2?",
      "How did speculative thinking affect the 1.5B model’s estimated inference speed when paired with a 32B model on AIME?",
      "What was the improvement in accuracy for the 1.5B model on GPQA with 32B assistance?"
    ],
    "ground_truth": [
      "The proportion of tokens in the final output that come from the target model.",
      "+185.9% compared to the standalone 1.5B model.",
      "+8.1% improvement."
    ]
  },
  {
    "question": [
      "Which benchmark datasets were used in the experiments described in Section 4?",
      "What three evaluation metrics were used to assess speculative thinking?",
      "How did speculative thinking affect the 1.5B model’s accuracy on AMC23?"
    ],
    "ground_truth": [
      "AIME 2022–2024, GPQA-Diamond, MATH500, and AMC23.",
      "Accuracy, average output length, and estimated inference speed.",
      "It increased by 5.0% with 32B assistance."
    ]
  },
  {
    "question": [
      "How much output modification by the target model was needed to significantly improve speculative model reasoning?",
      "How did the 1.5B speculative model assisted by 32B perform in terms of efficiency compared to the standalone 32B?",
      "What trade-off does speculative thinking offer according to the analysis?"
    ],
    "ground_truth": [
      "About 20% of the speculative model’s output.",
      "It outperformed the standalone 32B in generation speed.",
      "A trade-off between performance and computational efficiency."
    ]
  },
  {
    "question": [
      "In speculative decoding experiments, what was the size of the speculative and target models?",
      "How many tokens at a time does Speculative Thinking take over during generation?",
      "What major issue does speculative decoding face compared to Speculative Thinking?"
    ],
    "ground_truth": [
      "The speculative model was 7B and the target model was 32B.",
      "20 tokens at a time.",
      "It suffers from a high rejection rate where nearly 50% of tokens need to be regenerated by the target model."
    ]
  },
  {
    "question": [
      "Why does speculative thinking avoid the high rejection rate issue found in speculative decoding?",
      "What are the two major categories of current approaches to enhancing LLM reasoning?",
      "Which project achieved state-of-the-art reasoning performance using GRPO?"
    ],
    "ground_truth": [
      "Because the target model only intervenes when necessary, rather than regenerating rejected tokens.",
      "Reinforcement learning and supervised fine-tuning.",
      "DeepSeek."
    ]
  },
  {
    "question": [
      "What was the goal of works replicating DeepSeek-R1?",
      "Which studies emphasized the importance of reasoning step structure over content?",
      "According to Ji et al. (2025), which part of reasoning instances is especially important for model performance?"
    ],
    "ground_truth": [
      "To uncover potential 'aha moments' in reasoning.",
      "Li et al. (2025a).",
      "The initial few tokens in each reasoning instance."
    ]
  },
  {
    "question": [
      "Which method was introduced by Kimi 1.5 to address verbose outputs in reasoning models?",
      "What does TokenSkip aim to achieve?",
      "What is the main idea behind LightThinker?"
    ],
    "ground_truth": [
      "The Long-to-Short method.",
      "It improves efficiency by removing redundant or uninformative tokens from training data.",
      "It compresses intermediate thoughts to produce shorter yet informative reasoning traces."
    ]
  },
  {
    "question": [
      "According to Wang et al. (2025) and Sui et al. (2025a), what happens to model output length when reasoning fails?",
      "Which method detects and terminates reasoning early?",
      "What is the central proposal of Speculative Thinking in the conclusion section?"
    ],
    "ground_truth": [
      "Outputs become significantly longer due to repetitive generation of supportive tokens like 'wait'.",
      "Dynasor.",
      "It leverages larger reasoning models to guide smaller ones through selective delegation at meaningful points."
    ]
  },
  {
    "question": [
      "What natural reasoning cues do LLMs exploit that Speculative Thinking leverages?",
      "On which dataset did experiments show gains in accuracy, average output length, and efficiency?",
      "What paradigm does the conclusion highlight for model collaboration?"
    ],
    "ground_truth": [
      "Reflection cues such as '\\n\\n'.",
      "MATH500.",
      "Collaborative inference between models of different capacities without additional training."
    ]
  },
  {
    "question": [
      "What is one limitation of Speculative Thinking regarding the target model?",
      "Why does the implementation assume both models are from the same family?",
      "What example of a prompt format is considered important for achieving the best results?"
    ],
    "ground_truth": [
      "It requires the target model to have stronger reasoning abilities than the speculative model.",
      "To leverage shared KV cache structures for faster inference.",
      "Please reason step by step, and put your final answer within \\boxed{}."
    ]
},
  {
    "question": [
      "Which keywords indicate reflection or hesitation in the framework?",
      "If both reflection and affirmation cues appear in a sentence, which label is chosen in case of a tie?",
      "What is the configured takeover length for verification-based intervention?"
    ],
    "ground_truth": [
      "“wait”, “alternatively”, “hold on”, “another”, “verify”, “think again”, “recap”, “check”.",
      "Reflection.",
      "125 tokens."
    ]
  }
]
"""

In [64]:
import json
data = json.loads(json_string1)

In [69]:
# install
# pip install ragas langchain-openai datasets

from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset, load_dataset

# suppose you already have your rag_chain (langchain Runnable)

# 1. Prepare eval dataset (questions + ground truth answers)

dataset = Dataset.from_list(data)

# 2. Run your RAG chain and collect answers + retrieved contexts
def run_chain(example):
    out = main_chain.invoke({"question": example["question"]})
    return {
        "question": example["question"],
        "answer": out["answer"],
        "contexts": out["contexts"],   # correct key, list[str]
        "ground_truth": example["ground_truth"]
    }

predictions = [run_chain(ex) for ex in dataset]

eval_dataset = Dataset.from_list(predictions)

# 3. Evaluate with Ragas
result = evaluate(
    eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
)

print(result)


TypeError: expected string or bytes-like object, got 'list'

In [74]:
out = main_chain.invoke({"question": dataset[0]["question"]})
print(out)
print(type(out["answer"]))
print(type(out["contexts"]))

TypeError: expected string or bytes-like object, got 'list'

In [75]:
test_in = {
    "context": format_docs(retriever.invoke(dataset[0]["question"])),
    "question": dataset[0]["question"]
}
print(type(test_in["context"]))
print(type(test_in["question"]))

print("CONTEXT PREVIEW:", test_in["context"][:200])
print("QUESTION:", test_in["question"])

# Now just run prompt directly
print(prompt.format(**test_in))

TypeError: expected string or bytes-like object, got 'list'

In [83]:
docs = retriever.invoke(dataset[0]["question"])
print("TYPE of retriever output:", type(docs))
print("First element:", docs[0])
print("Is list?", isinstance(docs, list))

formatted = format_docs(docs)
print("TYPE of formatted:", type(formatted))
print("Preview formatted:", formatted[:200])

TypeError: expected string or bytes-like object, got 'list'

In [78]:
type(docs)

list

In [80]:
docs = retriever.invoke(dataset[0]["question"])
formatted = format_docs(docs)

print("FORMATTED TYPE:", type(formatted))       # should be str
print("QUESTION TYPE:", type(dataset[0]["question"]))  # should be str

print(prompt.input_variables)  # see what the prompt expects
print(prompt.format(context=formatted, question=dataset[0]["question"]))


TypeError: expected string or bytes-like object, got 'list'

In [86]:
out = with_docs({"question": dataset[0]["question"]})

TypeError: expected string or bytes-like object, got 'list'