In [32]:
from dotenv import load_dotenv
import os
from langchain_google_genai import ChatGoogleGenerativeAI

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-lite', google_api_key=os.environ["GOOGLE_API_KEY"])

In [2]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('sample.pdf')
docs = loader.load()

In [3]:
len(docs)

17

In [4]:
text = ''

for i in range(len(docs)):
    text += docs[i].page_content

from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_text(text)

from langchain_core.documents import Document

docs = [Document(page_content=item) for item in chunks]
len(docs)


69

In [5]:
len(docs[0].page_content)

963

In [13]:
divided = "<sep>"

for doc in docs:
    divided += doc.page_content + "<sep>"

divided

'<sep>Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still suffers\nfrom inefficient, overly lengthy outputs. We introduce Speculative Think-\ning1, a training-free framework that enables large reasoning models to\nguide smaller ones during inference at the reasoning level, distinct from\nspeculative decoding, which operates at the token level. Our approach\nis based on two observations: (1) reasoning-supportive tokens such as\n“wait” frequently appear after structural delimiters like “\\n\\n”, serving as\nsignals for reflection or continuation; and (2) larger models exhibit stronger<sep>“wait” fr

In [24]:
len(divided.split("<sep>"))

71

In [41]:
divided
len(divided)
len(divided.split("<sep>"))

71

In [None]:
from langchain_groq import ChatGroq
model2 = ChatGoogleGenerativeAI(model='gemini-2.0-flash-lite', google_api_key='...')

In [42]:
prompt = f"""You are a preprocessing assistant. Your task is to rewrite text chunks into self-contained units that can be understood without additional context. Do not add new information; only restate or clarify using the text itself."

The following text contains multiple chunks separated by the token <sep>.  

Text: {divided}

Task:  
- For each chunk, rewrite it so it is fully self-contained and understandable without other chunks.  
- Do not add new facts, only restate missing references.  
- Keep meaning and details intact.  
- Output each rewritten chunk in order, labeled clearly.  
- KEEP THE NUMBER OF CHUNKS SAME AND INTACT.

Number of Chunks: {len(divided.split("<sep>"))}
Example output(JSON):
{{
    "0":"Chunk1",
    "1":"Chunk2",
}}

"""

contextualized3 = model2.invoke(prompt)


In [43]:
contextualized3

AIMessage(content='```json\n{\n    "0": "This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.",\n    "1": "The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like \\"wait\\" often follow structural delimiters like \\"\\\\n\\\\n\\". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For 

In [44]:
len(contextualized3.content)

27889

In [45]:
json_string = "{" + contextualized3.content[12:]
json_string 

'{  "0": "This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.",\n    "1": "The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like \\"wait\\" often follow structural delimiters like \\"\\\\n\\\\n\\". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accu

In [46]:
json_string2 = json_string[:-7] + '"}'
json_string2

'{  "0": "This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.",\n    "1": "The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like \\"wait\\" often follow structural delimiters like \\"\\\\n\\\\n\\". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accu

In [47]:
print(type(json_string))   

<class 'str'>


In [48]:
import json
data = json.loads(json_string2)

In [None]:
data

{'0': 'This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.',
 '1': 'The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like "wait" often follow structural delimiters like "\\n\\n". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accuracy on MATH500 inc

In [51]:
{'0': 'This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.',
 '1': 'The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like "wait" often follow structural delimiters like "\\n\\n". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accuracy on MATH500 increases from 83.2% to 89.4% with the help of a 32B model. The average output length decreased by 15.7%. The framework also improves non-reasoning models, such as Qwen-2.5-7B-Instruct, on the same benchmark.',
 '2': "The provided results demonstrate the effectiveness of Speculative Thinking. On the AIME dataset, the accuracy improvement for the 1.5B model was +6.7% with a -11.8% decrease in average output length. On the MATH500 dataset, the 1.5B model's accuracy increased by +6.2% with a -15.7% decrease in length. The GPQA dataset saw an +8.1% increase in accuracy and a -4.0% decrease in length. The AMC23 dataset showed a +5.0% increase in accuracy and a -16.9% decrease in average output length.",
 '3': 'Figure 1 illustrates that Speculative Thinking significantly improves the reasoning accuracy of the 1.5B model while reducing the average output length. The figure compares the accuracy and output length of models on four datasets: AIME 2020–2024, MATH500, GPQA, and AMC23. The Deepseek-Distilled Qwen 2.5-1.5B model is referred to as "1.5B," the Deepseek-Distilled Qwen 2.5-32B model is referred to as "32B," and the proposed Speculative Thinking method (where the 32B model supervises the 1.5B model\'s reflective reasoning steps) is represented as "1.5B+32B." The code is available at https://github.com/uservan/speculative_thinking.',
 '4': 'This is a preprint under review. The paper is titled "Speculative Thinking: Enhancing Small-Model Reasoning with Large Model Guidance at Inference Time." The introduction discusses the widespread use of smaller language models in real-world applications due to their lower computational and memory requirements. These models often underperform on complex reasoning tasks. Current methods to improve them, such as supervised fine-tuning or reinforcement learning, can be costly and difficult to scale.',
 '5': 'The introduction continues by mentioning that inference-time scaling methods have been proposed to improve small models without retraining. However, these methods often yield limited improvements, especially on complex tasks. Larger models have stronger reasoning abilities but are impractical for many deployment scenarios due to inference cost and latency. The central question is whether small models can be improved during inference by selectively using large models, without retraining.',
 '6': 'Inspired by speculative decoding (Leviathan et al., 2023), the paper proposes Speculative Thinking, a training-free framework to improve small-model reasoning during inference. Unlike speculative decoding, which operates at the token level, this approach focuses on the reasoning level. A small model generates most of the output but delegates difficult reasoning segments to a stronger model. These segments are identified through structural cues, like paragraph breaks ("\\n\\n") followed by reflective phrases, such as "wait" and "alternatively."',
 '7': "The Speculative Thinking framework aims to preserve the small model's efficiency while leveraging the large model's strength. Empirical results show that a 1.5B model, assisted by Deepseek-distilled Qwen-2.5-32B, improved by +6.6% on AIME, +6.2% on MATH500 (Lightman et al., 2023), +8.1% on GPQA (Rein et al., 2024), and +5.0% on AMC23, while reducing output length. This approach is also effective for non-reasoning models. Qwen-2.5-7B-Instruct gained +7.8% on MATH500 and +14.2% on GPQA when assisted by the 32B mentor.",
 '8': 'In summary, Speculative Thinking offers a new inference-time paradigm that combines the efficiency of small models with the reasoning strength of large models, potentially leading to cost-effective reasoning augmentation.',
 '9': 'The paper\'s second section is titled "2 Motivations." It discusses the analysis of LLM reasoning processes. Reasoning models often generate reasoning-supportive tokens like "wait," "hmm," and "alternatively," which are related to self-reflection behavior. The authors analyzed the preceding tokens for these reasoning-supportive tokens in the Deepseek-distilled Qwen-2.5-32B model on the MATH500 dataset.',
 '10': 'The analysis of reasoning-supportive tokens shows that "wait", "alternatively", and "hmm" are frequently preceded by structural delimiters such as "\\n\\n". For instance, in the case of "wait", over 80% of its preceding tokens are "\\n\\n". This suggests that "\\n\\n" acts as a thinking cue, prompting the model to reflect or continue reasoning. The same analysis was extended to other models on the MATH500 dataset in Appendix A.4.',
 '11': 'The authors conducted a case study on responses generated by Deepseek-distilled Qwen-2.5-1.5B and Qwen-2.5-1.5B-Instruct to further prove the role of "\\n\\n". The study segmented the model\'s output into Affirmation, Reflection, and Statement based on the presence of specific keywords. The authors found that sentences following "\\n\\n" often contain reasoning-related cues, suggesting that "\\n\\n" is a discourse marker.',
 '12': 'The second part of the "2 Motivations" section addresses comparisons between small and large reasoning models. The authors compared reasoning models of different sizes, including Deepseek-distilled Qwen-2.5-32B, 7B, and 1.5B, focusing on accuracy and output length on the AIME 2022-2024 dataset. The results are shown in Figure 3, and detailed statistics on other datasets can be found in Appendix A.5.',
 '13': 'The conclusion from the comparison of models of different sizes is that small reasoning models have worse reasoning performance and much longer responses. The authors reported accuracy and average output length for the three models. Smaller models have significantly lower accuracy, while their average output length tends to be much longer. As model size increases, accuracy improves, and outputs become more concise. The average length of incorrect responses is consistently longer than correct ones.',
 '14': 'The analysis concludes that larger-scale models exhibit more effective self-reflection and backtracking during reasoning. The authors analyzed the frequency of reflective phrases, such as "wait" and "alternatively," which indicate hesitation, self-reflection, or backtracking in the reasoning process. These phrases occur more frequently in incorrect responses, particularly in smaller models. This suggests that smaller models tend to over-reflect, leading to inefficient exploration of the solution space.',
 '15': 'The third section of the "2 Motivations" section asks, "2.3 How to Combine Small and Large Reasoning Model?" The authors observe that when reasoning models generate incorrect answers, their average output length increases significantly, often accompanied by excessive use of words like "wait." The question is whether the reasoning ability of larger models can be leveraged to monitor smaller models during inference.',
 '16': 'The paper proposes a novel intervention strategy that utilizes the "\\n\\n" reasoning pattern as a control point for collaborative inference. When a smaller model encounters "\\n\\n" followed by tokens like "wait," the authors suggest delegating the subsequent reasoning step to a larger model. The larger model would then generate the next thought segment, acting as a reasoning supervisor or corrector. This large-model-aided intervention may enhance the robustness and accuracy of smaller models by injecting stronger reasoning capabilities.',
 '17': 'The third section of the paper is titled "3 Method: Speculative Thinking." The authors propose a collaborative inference framework called Speculative Thinking, where a small model acts as the speculative model and a large model serves as the target model. The speculative model performs primary reasoning, while the target model intervenes selectively. The overall framework is illustrated in Figure 4. The hyper-parameters for Speculative Thinking are shown in Appendix A.2.',
 '18': "The target model takes over the speculative model's generation under three scenarios: (1) Affirmation/Reflection Takeover. This mechanism uses the target model to decide whether to continue or revise. (2) Verification Takeover. This intervention is triggered when a \\n\\n delimiter is encountered, and the sentence contains verification cues. (3) Excessive Reflection Takeover. This intervention is implemented using a negativity counter that tracks the number of reflection sentences.",
 '19': 'Table 2 shows the accuracy, average output length, and estimated speed of models on four datasets. The 1.5B model refers to the Deepseek-Distilled Qwen-2.5-1.5B model. "+" means with the help of large models. Modify ratio indicates the proportion of tokens in the final output that come from the target model. After applying Speculative Thinking, both 1.5B and 7B models improved in accuracy, output length, and estimated inference speed.',
 '20': 'The results in Table 2 show the performance improvements with Speculative Thinking. The 1.5B model saw improvements in accuracy, and output length, and estimated inference speed. The 7B model also saw improvements in accuracy, output length, and estimated inference speed.',
 '21': 'The fourth section, "4 Experiments," focuses on the evaluation of Speculative Thinking. The authors adopted three key evaluation metrics: accuracy, average output length, and estimated inference speed. The rationale for choosing the estimated inference speed and the details of its computation are provided at the end of this section. Experiments were conducted on four benchmark datasets: AIME 2022–2024, GPQA-Diamond, MATH500, and AMC23.',
 '22': 'The section titled "4.1 Large Reasoning Models Monitor Small Reasoning Models" analyzes the results. The results are summarized in Table 2, demonstrating that the method consistently improves accuracy, reduces output length, and enhances inference speed. For example, after being assisted by the 32B target model, the 1.5B speculative model demonstrates consistent and significant improvements across multiple datasets.',
 '23': "The results demonstrate the effectiveness of Speculative Thinking. The 1.5B model's accuracy increased by 6.2% on MATH500, 8.1% on GPQA, 5.0% on AMC23, and 6.6% on AIME. In addition, the average output length decreased by 15.7%, 3.9%, 16.9%, and 11.7% on the same datasets, respectively. The 1.5B model assisted by the 32B model consistently outperforms the standalone 32B model in terms of generation speed. The target model only needs to modify approximately 20% of the speculative model’s output to significantly enhance its reasoning performance.",
 '24': 'Figure 5 shows a comparison between the prefix and decode stages, revealing that the time required to process multiple tokens during the prefix phase is nearly equivalent to the time taken to decode a single token.',
 '25': 'The paper provides a "Theoretical Estimation of FLOPs and Token Generation Speed." The authors used a theoretical analysis instead of empirical timing because Speculative Thinking introduces logical coordination between models. Runtime measurements would be significantly affected by backend GPU optimizations, especially in systems like vLLM (Kwon et al., 2023). The computation of FLOPs for prefill and decode stages is in Appendix A.1.',
 '26': 'The average inference time for both decode and prefix stages was empirically profiled across various model sizes and output token lengths. These measurements were obtained using the generate() API from HuggingFace Transformers, with key-value cache enabled for the prompt. The authors observed that the average time in the prefix stage remains relatively stable across positions when GPU memory is sufficient. To reflect the difference, a speedup for the prefix stage was assumed: FLOPsprefix(m) = FLOPsdecode(n = 1), where m and n mean the token number. The GPU computational capacity was set to 3.12 × 1010 FLOPs/s, which corresponds to an A100-class GPU. The estimated speed is calculated as follows: Estimated Speed = Total Tokens / (FLOPsprefill + FLOPsprefix + FLOPsdecode) / GPU Capacity (1).',
 '27': 'The section titled "4.2 Reasoning Models Monitor Non-Reasoning Models" asks if reasoning-capable models can enhance the performance and accuracy of non-reasoning models. The Speculative Thinking framework was adapted to monitor a speculative model that lacks inherent reasoning capability.',
 '28': 'The modifications for speculative thinking applied to non-reasoning models are explained. In Affirmation/Reflection Takeover, the target model directly generates the next sentence after each "\\n\\n." The target model generates the first 100 tokens before any question answering begins.',
 '29': 'The results of Reasoning Models Monitoring Non-Reasoning Models are analyzed. The results are shown in Table 3. Qwen-2.5-7B-Instruct, a non-reasoning model, benefits notably from speculative assistance by both 7B and 32B reasoning models. For instance, on the MATH500 dataset, its accuracy improves from 74.0% to 81.8%. However, this came at the cost of increased output length.',
 '30': 'The results of the experiment with non-reasoning models also showed that when assisted by the 1.5B reasoning model, performance improvements are not consistently observed. The authors state that it is preferable to choose a target model that is either of equal size or larger than the speculative model, and more importantly, possesses stronger reasoning capabilities. Mismatches where the speculative model is larger or stronger than the target model may lead to suboptimal or even detrimental outcomes.',
 '31': 'Section 4.3 is titled "Comparisons between Speculative Decoding and Speculative Thinking." Figure 6 compares speculative decoding and speculative thinking using a 7B speculative model and a 32B target model. In Speculative Decoding, the speculative model generates 20 tokens per step to match the number of intervention tokens in Speculative Thinking.',
 '32': 'The experiment compares speculative decoding and speculative thinking. Speculative decoding relies on the speculative and target models having similar token output distributions to accelerate generation. Speculative Thinking focuses on enhancing the speculative model’s reasoning with lightweight assistance from the target model, without strictly requiring token distributional alignment.',
 '33': 'Figure 6 shows that speculative decoding, although matching the accuracy of the 32B model, often suffers from a high rejection rate, which diminishes its speed. Speculative Thinking avoids this issue by allowing the target model to intervene only when necessary, improving the speculative model’s reasoning with minimal overhead.',
 '34': 'The fifth section, "5 Related Works," discusses related research. The section on LLM Reasoning notes that current approaches to enhancing the reasoning capabilities of language models primarily fall into reinforcement learning and supervised fine-tuning. For instance, DeepSeek (Guo et al., 2025; Liu et al., 2024) achieved state-of-the-art reasoning performance using GRPO (Shao et al., 2024; Yu et al., 2025), and further improved smaller models by distilling high-quality reasoning traces.',
 '35': 'The related work section continues by highlighting the inspiration from DeepSeek-R1 and efforts to replicate it, including works such as Logic RL (Xie et al., 2025) and SimpleRL-Zoo (Zeng et al., 2025). Many studies also use SFT to improve reasoning, including SkyThought-T1 (Team, 2025b) and Bespoke-Stratos-32B (Labs, 2025). Several works have further investigated key techniques for enhancing reasoning performance during RL (Baek & Tegmark, 2025; Yeo et al., 2025) or SFT (Chen et al., 2025b; 2024a; Tian et al., 2025; Liu et al., 2025b).',
 '36': 'The related works section continues by citing that (Li et al., 2025a) argues that the structure of reasoning steps in the data is more critical than the actual content. (Ji et al., 2025) highlights the importance of the initial few tokens in each reasoning instance for optimizing model performance. In addition, several recent studies—such as s1(Muennighoff et al., 2025) emphasize the value of selecting a small set of high-quality reasoning samples to drive efficient model improvement.',
 '37': 'The related works section also discusses efficient reasoning. Current reasoning models still exhibit notable limitations (Bandyopadhyay et al., 2025; Li et al., 2025c). One prominent issue is excessive response length. Efficient reasoning has become an emerging research focus.',
 '38': 'An early effort in efficient reasoning was proposed by Kimi 1.5 (Team et al., 2025), which introduced the Long-to-Short method. The idea was later reproduced by Sky-Thought (Team, 2025a). TokenSkip (Xia et al., 2025) improves efficiency by identifying and removing redundant tokens. LightThinker (Zhang et al., 2025) explicitly compresses intermediate thoughts to generate shorter reasoning traces. Wang et al. (2025); Sui et al. (2025a) highlight a counterintuitive phenomenon: when reasoning fails, model outputs often become significantly longer.',
 '39': 'Other notable approaches include Dynasor(Fu et al., 2024). There are some other works including efficient reaosning (Aytes et al., 2025; Lee et al., 2025; Sui et al., 2025c; Xu et al., 2025; Liao et al., 2025).',
 '40': 'The sixth section is "6 Conclusion." The authors propose Speculative Thinking, a training-free framework that leverages larger reasoning models to guide smaller ones through selective delegation at structurally meaningful points in generation. The approach significantly enhances both accuracy, average output length and efficiency without any additional training in four math reasoning datasets like MATH500. Experiments demonstrate substantial gains in performance and output conciseness.',
 '41': 'The conclusion continues by stating that the framework highlights a promising paradigm for improving reasoning of reasoning and non-reasoning models without additional data or training computation cost.',
 '42': 'The paper\'s section on "Limitations" states that Speculative Thinking relies on a larger target model to improve the reasoning ability and reduce the output length of a smaller speculative model. For the framework to be effective, the target model must possess stronger reasoning capabilities than the speculative model. The current implementation assumes that both models belong to the same model family, which allows the authors to leverage shared KV cache structures to accelerate inference.',
 '43': 'The limitations section continues by stating that the performance of Speculative Thinking is sensitive to prompt quality—utilizing an optimized prompt for each model is critical to achieving the best results, like “Please reason step by step, and put your final answer within \\boxed{}.”.',
 '44': 'The references section lists the following: Simon A Aytes, Jinheon Baek, and Sung Ju Hwang. Sketch-of-thought: Efficient llm reasoning with adaptive cognitive-inspired sketching. arXiv preprint arXiv:2503.05179, 2025.',
 '45': 'The references section continues listing the following: David D. Baek and Max Tegmark. Towards understanding distilled reasoning models: A representational approach, 2025. URL https://arxiv.org/abs/2503.03730.',
 '46': 'The references section continues listing the following: Dibyanayan Bandyopadhyay, Soham Bhattacharjee, and Asif Ekbal. Thinking machines: A survey of llm based reasoning strategies. arXiv preprint arXiv:2503.10814, 2025.',
 '47': 'The references section continues listing the following: Qiguang Chen, Libo Qin, Jiaqi Wang, Jingxuan Zhou, and Wanxiang Che. Unlocking the capabilities of thought: A reasoning boundary framework to quantify and optimize chain-of-thought. Advances in Neural Information Processing Systems, 37:54872–54904, 2024a.',
 '48': 'The references section continues listing the following: Qiguang Chen, Libo Qin, Jinhao Liu, Dengyun Peng, Jiannan Guan, Peng Wang, Mengkang Hu, Yuhang Zhou, Te Gao, and Wangxiang Che. Towards reasoning era: A survey of long chain-of-thought for reasoning large language models. arXiv preprint arXiv:2503.09567, 2025a.',
 '49': 'The references section continues listing the following: Xinghao Chen, Zhijing Sun, Wenjin Guo, Miaoran Zhang, Yanjun Chen, Yirong Sun, Hui Su, Yijie Pan, Dietrich Klakow, Wenjie Li, et al. Unveiling the key factors for distilling chain-of-thought reasoning. arXiv preprint arXiv:2502.18001, 2025b.',
 '50': 'The references section continues listing the following: Yushuo Chen, Tianyi Tang, Erge Xiang, Linjiang Li, Wayne Xin Zhao, Jing Wang, Yunpeng Chai, and Ji-Rong Wen. Towards coarse-to-fine evaluation of inference efficiency for large language models. arXiv preprint arXiv:2404.11502, 2024b.',
 '51': 'The references section continues listing the following: Li Chenglin, Qianglong Chen, Liangyue Li, Caiyu Wang, Feng Tao, Yicheng Li, Zulong Chen, and Yin Zhang. Mixed distillation helps smaller language models reason better. In Findings of the Association for Computational Linguistics: EMNLP 2024, pp. 1673–1690, 2024.',
 '52': 'The references section continues listing the following: Yichao Fu, Junda Chen, Siqi Zhu, Zheyu Fu, Zhongdongming Dai, Aurick Qiao, and Hao Zhang. Efficiently serving llm reasoning programs with certaindex. arXiv preprint arXiv:2412.20993, 2024.',
 '53': 'The references section continues listing the following: Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948, 2025.',
 '54': 'The references section continues listing the following: Xiaotian Han. Reproduce the inference time scaling exp, 2024. URL https://ahxt.github. io/blog/2024-12-30-inference-time-scaling-exp/ . 2024-12-30.',
 '55': 'The references section continues listing the following: Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, et al. Openai o1 system card. arXiv preprint arXiv:2412.16720, 2024.',
 '56': 'The references section continues listing the following: Ke Ji, Jiahao Xu, Tian Liang, Qiuzhi Liu, Zhiwei He, Xingyu Chen, Xiaoyuan Liu, Zhijie Wang, Junying Chen, Benyou Wang, et al. The first few tokens are all you need: An efficient and effective unsupervised prefix fine-tuning method for reasoning models.arXiv preprint arXiv:2503.02875, 2025.',
 '57': 'The references section continues listing the following: Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph E. Gonzalez, Hao Zhang, and Ion Stoica. Efficient memory management for large language model serving with pagedattention. In Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, 2023.',
 '58': 'The references section continues listing the following: Bespoke Labs. Bespoke-stratos: The unreasonable effectiveness of reasoning distillation. www.bespokelabs.ai/blog/bespoke-stratos-the-unreasonable-effectiveness-of-reasoning-distillation, 2025. Accessed: 2025-01-22.',
 '59': 'The references section continues listing the following: Ayeong Lee, Ethan Che, and Tianyi Peng. How well do llms compress their own chain-of-thought? a token complexity approach. arXiv preprint arXiv:2503.01141, 2025.',
 '60': 'The references section continues listing the following: Yaniv Leviathan, Matan Kalman, and Yossi Matias. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning, pp. 19274–19286. PMLR, 2023.',
 '61': 'The references section continues listing the following: Dacheng Li, Shiyi Cao, Tyler Griggs, Shu Liu, Xiangxi Mo, Eric Tang, Sumanth Hegde, Kourosh Hakhamaneshi, Shishir G. Patil, Matei Zaharia, Joseph E. Gonzalez, and Ion Stoica. Llms can easily learn to reason from demonstrations structure, not content, is what matters!, 2025a. URL https://arxiv.org/abs/2502.07374.',
 '62': 'The references section continues listing the following: Yuetai Li, Xiang Yue, Zhangchen Xu, Fengqing Jiang, Luyao Niu, Bill Yuchen Lin, Bhaskar Ramasubramanian, and Radha Poovendran. Small models struggle to learn from strong reasoners. arXiv preprint arXiv:2502.12143, 2025b.',
 '63': 'The references section continues listing the following: Zhong-Zhi Li, Duzhen Zhang, Ming-Liang Zhang, Jiaxin Zhang, Zengyan Liu, Yuxuan Yao, Haotian Xu, Junhao Zheng, Pei-Jie Wang, Xiuyi Chen, et al. From system 1 to system 2: A survey of reasoning large language models. arXiv preprint arXiv:2502.17419, 2025c.',
 '64': 'The references section continues listing the following: Baohao Liao, Yuhui Xu, Hanze Dong, Junnan Li, Christof Monz, Silvio Savarese, Doyen Sahoo, and Caiming Xiong. Reward-guided speculative decoding for efficient llm reasoning. arXiv preprint arXiv:2501.19324, 2025.',
 '65': 'The references section continues listing the following: Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let’s verify step by step. arXiv preprint arXiv:2305.20050, 2023.',
 '66': 'The references section continues listing the following: Aixin Liu, Bei Feng, Bin Wang, Bingxuan Wang, Bo Liu, Chenggang Zhao, Chengqi Dengr, Chong Ruan, Damai Dai, Daya Guo, et al. Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434, 2024.',
 '67': 'The references section continues listing the following: Runze Liu, Junqi Gao, Jian Zhao, Kaiyan Zhang, Xiu Li, Biqing Qi, Wanli Ouyang, and Bowen Zhou. Can 1b llm surpass 405b llm? rethinking compute-optimal test-time scaling, 2025a. URL https://arxiv.org/abs/2502.06703.',
 '68': 'The references section continues listing the following: Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. Understanding r1-zero-like training: A critical perspective. arXiv preprint arXiv:2503.20783, 2025b.',
 '69': 'The references section continues listing the following: Zhenyan Lu, Xiang Li, Dongqi Cai, Rongjie Yi, Fangming Liu, Xiwen Zhang, Nicholas D. Lane, and Mengwei Xu. Small language models: Survey, measurements, and insights, 2025. URL https://arxiv.org/abs/2409.15790.',
 '70': 'The references section continues listing the following: Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori Hashimoto. s1: Simple test-time scaling, 2025. URL https://arxiv.org/abs/2501.19393.'}

{'0': 'This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.',
 '1': 'The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like "wait" often follow structural delimiters like "\\n\\n". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accuracy on MATH500 inc

In [52]:
type(data)

dict

In [53]:
len(data)

71

In [54]:
for i in range(len(data)):
    print(data[str(i)])

This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.
The paper's abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like "wait" often follow structural delimiters like "\n\n". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model's accuracy on MATH500 increases from 83.2% to

In [55]:
original_data = divided.split("<sep>")
original_data 

['',
 'Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still suffers\nfrom inefficient, overly lengthy outputs. We introduce Speculative Think-\ning1, a training-free framework that enables large reasoning models to\nguide smaller ones during inference at the reasoning level, distinct from\nspeculative decoding, which operates at the token level. Our approach\nis based on two observations: (1) reasoning-supportive tokens such as\n“wait” frequently appear after structural delimiters like “\\n\\n”, serving as\nsignals for reflection or continuation; and (2) larger models exhibit stronger',
 '“wait” f

In [56]:
len(original_data)

71

In [57]:
meta = list(data.values())
meta

['This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.',
 'The paper\'s abstract explains that existing methods often require costly training and produce inefficient outputs. Speculative Thinking uses large reasoning models to guide smaller ones during inference. This is achieved by observing that reasoning-supportive tokens like "wait" often follow structural delimiters like "\\n\\n". Larger models exhibit better control over reflective behavior. The method boosts reasoning accuracy in smaller models and shortens their outputs. For example, the 1.5B model\'s accuracy on MATH500 increases fro

In [58]:
len(meta)

71

In [59]:
result = []
max_len = max(len(original_data), len(meta))
for i in range(max_len):
    x = original_data[i] if i < len(original_data) else ""
    y = meta[i] if i < len(meta) else ""
    result.append(f"{x}. Document Meaning : {y}")


In [60]:
result

['. Document Meaning : This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.',
 'Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still suffers\nfrom inefficient, overly lengthy outputs. We introduce Specul

In [61]:
print(f"{result[2]}")

“wait” frequently appear after structural delimiters like “\n\n”, serving as
signals for reflection or continuation; and (2) larger models exhibit stronger
control over reflective behavior, reducing unnecessary backtracking while
improving reasoning quality. By strategically delegating reflective steps
to a more capable model, our method significantly boosts the reasoning
accuracy of reasoning models while shortening their output. With the assis-
tance of the 32B reasoning model, the 1.5B model’s accuracy on MATH500
increases from 83.2% to 89.4%, marking a substantial improvement of 6.2%.
Simultaneously, the average output length is reduced from5439 tokens to
4583 tokens, representing a 15.7% decrease. Moreover, when applied to a
non-reasoning model (Qwen-2.5-7B-Instruct), our framework boosts its
accuracy from 74.0% to 81.8% on the same benchmark, achieving a relative
improvement of 7.8%.
+6.7%
-11.8%
(a) AIME
+6.2% -15.7% (b) MATH500
+8.1%
-4.0%
(c) GPQA
+5.0% -16.9% (d) AMC23. Docum

In [62]:
from langchain_core.documents import Document
docs = [Document(page_content=item) for item in result]

In [63]:
docs

[Document(metadata={}, page_content='. Document Meaning : This is a preprint under review. The paper introduces Speculative Thinking, a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference. It focuses on the reasoning level, unlike speculative decoding, which operates at the token level. The authors are Wang Yang, Xiang Yue, Vipin Chaudhary, and Xiaotian Han, affiliated with Case Western Reserve University and Carnegie Mellon University.'),
 Document(metadata={}, page_content='Preprint. Under review.\nSpeculative Thinking: Enhancing Small-Model Reasoning\nwith Large Model Guidance at Inference Time\nWang Yang1, Xiang Yue2, Vipin Chaudhary1, Xiaotian Han1\n1Case Western Reserve University 2Carnegie Mellon University\n{wxy320,vxc204,xhan}@case.edu xyue2@andrew.cmu.edu\nAbstract\nRecent advances leverage post-training to enhance model reasoning perfor-\nmance, which typically requires costly training pipelines and still

In [65]:
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [66]:
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
vector_store = FAISS.from_documents(docs, embeddings)

In [67]:
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':4})

In [68]:
retriever.invoke("Qwen")

[Document(id='7276c387-63e0-42d2-a2e1-6a452dc8f1b1', metadata={}, page_content='and n3 = 125 for Excessive Negativity Takeover. These hyperparameters are selected to\nbalance informativeness and computational cost.\nA.3 Results of Deepseek-Distilled Qwen-2.5-7B\nWe present the accuracy and average output length of Deepseek-Distilled Qwen-2.5-7B on\nfour datasets.\n14Preprint. Under review.\n7B 7B+32B 32B\n20\n40\n60Accuracy\n48.89 53.33\n65.56\n7B 7B+32B 32B\n11000\n12000\n13000\n14000Average Length\n13250 13214\n12274\n(a) AIME\n7B 7B+32B 32B\n90\n91\n92\n93\n94Accuracy\n92.80 93.00 92.80\n7B 7B+32B 32B\n3600\n3800\n4000Average Length\n3975\n3768 3802 (b) MATH500\n7B 7B+32B 32B\n30\n40\n50\n60Accuracy\n45.45\n52.02\n61.62\n7B 7B+32B 32B\n5000\n5500\n6000Average Length\n6111\n5952\n5407\n(c) GPQA\n7B 7B+32B 32B\n90\n92\n94\n96Accuracy\n92.50 92.50\n95.00\n7B 7B+32B 32B\n5000\n6000\n7000Average Length\n6094\n5116\n7107 (d) AMC23\nFigure 8: Accuracy and average output length of models on

In [70]:
import json
from datasets import Dataset

# Load your json
with open("main_test_data.json", encoding="utf-8") as f:
    data = json.load(f)

# Flatten
rows = []
for entry in data:
    for q, g in zip(entry["question"], entry["ground_truth"]):
        rows.append({
            "question": q,
            "ground_truth": [g],   # ragas expects a list of answers
            "contexts": [],        # to be filled after retrieval
            "answer": ""           # to be filled after running RAG
        })

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(rows)
print(dataset)

Dataset({
    features: ['question', 'ground_truth', 'contexts', 'answer'],
    num_rows: 123
})


In [None]:
model2 = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", google_api_key="...")

In [76]:
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
prompt = PromptTemplate(template="""
You are a helpful assistant.
Answer ONLY from the provided transcript context.
If the context is insufficient, just say you don't know.
Context: {context}
Question: {question}
                        """, input_variables=['context', 'question'])

def format_docs(retrieved_docs):
    # Handle both list[Document] and single Document
    if not isinstance(retrieved_docs, list):
        retrieved_docs = [retrieved_docs]

    return "\n\n".join(doc.page_content for doc in retrieved_docs)

parallel_chain = RunnableParallel(
    {
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough()
    }
)
parser = StrOutputParser()
normal_chain = parallel_chain | prompt | model2 | parser
main_chain = parallel_chain | normal_chain

In [77]:
from tqdm import tqdm

def run_rag(example):
    # 1. Retrieve docs separately (to save both docs & answer)
    retrieved_docs = retriever.invoke(example["question"])
    contexts = [doc.page_content for doc in retrieved_docs]

    # 2. Generate answer using your chain
    answer = normal_chain.invoke(example["question"])

    return {
        "contexts": contexts,
        "answer": answer
    }

# Apply to dataset
dataset_with_outputs = dataset.map(run_rag, num_proc=1)

print(dataset_with_outputs[0])

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 50
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 52
}
].


{'question': "What is the main contribution of the paper 'Speculative Thinking'?", 'ground_truth': ['It introduces a training-free framework to enhance reasoning in small language models by using larger models for guidance during inference.'], 'contexts': ['Small reasoning models have worse reasoning performances and much longer responses.\nWe first report the accuracy and average output length for all three models. As shown\nin Figure 3, smaller models exhibit significantly lower accuracy compared to larger ones.\nInterestingly, the average output length of smaller models tends to be much longer. As\nmodel size increases, accuracy improves while outputs become more concise. To further\nunderstand this phenomenon, we analyze the average lengths of correct and incorrect\nresponses separately. We find that, across all model sizes, incorrect responses are consistently\nmuch longer than correct ones. This suggests that the overall average output length is\nheavily influenced by the proport

In [78]:
from datasets import Dataset

# Convert ground_truth from list[str] -> str
def flatten_ground_truth(example):
    example["ground_truth"] = example["ground_truth"][0]  # take the first (and only) element
    return example

dataset_ready = dataset_with_outputs.map(flatten_ground_truth)

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

In [79]:
os.environ["RAGAS_TRACKING_DISABLED"] = "true"
os.environ["OPENAI_API_KEY"] = "dummy"

In [None]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    context_precision,
    faithfulness,
    context_recall
)

# gemini_llm = ChatGroq(model="openai/gpt-oss-20b", temperature=0, groq_api_key = "...)
gemini_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", google_api_key="...")

result = evaluate(
    dataset_ready,
    metrics=[answer_correctness, context_precision, faithfulness, context_recall],
    llm=gemini_llm
)
# print(result)

Evaluating:   0%|          | 0/492 [00:00<?, ?it/s]

Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 1
}
].
Retrying langchain_google_genai.chat_models._achat_with_retry.<locals>._achat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 1
}
].
Retrying langchain_google_ge

In [81]:
print(result)

{'answer_correctness': nan, 'context_precision': 0.7014, 'faithfulness': 0.7500, 'context_recall': 0.9091}


In [None]:
;