In [None]:
import pandas as pd    

# Parse the JSON Lines question file into a DataFrame (one JSON object per line).
# Loading approach from: https://stackoverflow.com/questions/50475635/loading-jsonl-file-as-json-objects
jsonObj = pd.read_json('../../Data/question.jsonl', lines=True)
# Collect the 'turns' entry of every row, in row order.
prompts = [jsonObj.at[row_label, 'turns'] for row_label in jsonObj.index]

In [None]:
import os
import time

import torch

# Point the Hugging Face cache at the local model directory. This is assigned
# before the eagle / fastchat imports further down, so it is already in place
# when those libraries are loaded.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
os.environ['HF_HOME'] = '/home/linux_user/EAGLE/eagle/model'

# Base (target) LLM checkpoints. These are the full models whose generation is
# being accelerated, so they are what gets passed as `base_model_path`.
# (Previously this list held the EAGLE draft weights, i.e. the two lists had
# each other's contents.)
base_model_paths = ['lmsys/vicuna-7b-v1.3',
                    'lmsys/vicuna-13b-v1.3',
                    'lmsys/vicuna-33b-v1.3',
                    'deepseek-ai/DeepSeek-R1-Distill-Llama-8B']

# EAGLE draft-model checkpoints, index-aligned with `base_model_paths`:
# entry i here is the EAGLE weight set trained for base model i.
EAGLE_model_paths = ['yuhuili/EAGLE-Vicuna-7B-v1.3',
                     'yuhuili/EAGLE-Vicuna-13B-v1.3',
                     'yuhuili/EAGLE-Vicuna-33B-v1.3',
                     'yuhuili/EAGLE3-DeepSeek-R1-Distill-LLaMA-8B']

# Most of the code below is adapted from https://github.com/SafeAILab/EAGLE
from eagle.model.ea_model import EaModel
from fastchat.model import get_conversation_template
# Keyword arguments for EaModel.from_pretrained, gathered in one mapping so the
# whole load configuration can be read (and tweaked) in one place.
# The first entry of each path list is used.
load_options = dict(
    base_model_path=base_model_paths[0],
    ea_model_path=EAGLE_model_paths[0],
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    # NOTE(review): -1 presumably asks EAGLE to choose the draft-token budget
    # itself — confirm against EaModel.from_pretrained.
    total_token=-1,
    use_flash_attention_2=True,
    # Offload directory; see https://github.com/nomic-ai/gpt4all/issues/239
    offload_folder="offload",
)
model = EaModel.from_pretrained(**load_options)

# Put the model into evaluation mode before generating.
model.eval()

# Build a single-turn Vicuna-style conversation: the user's message followed by
# an empty assistant slot (None) for the model to complete.
your_message = "Hello"
conv = get_conversation_template("vicuna")
user_role, assistant_role = conv.roles[0], conv.roles[1]
conv.append_message(user_role, your_message)
conv.append_message(assistant_role, None)
prompt = conv.get_prompt()

# Tokenise the prompt as a batch of one and move the ids onto the GPU.
input_ids = torch.as_tensor(model.tokenizer([prompt]).input_ids).cuda()

# --- Benchmark one EAGLE generation: wall-clock time and throughput ---

# Drain any GPU work that is still queued so the timer starts from an idle device.
torch.cuda.synchronize()
start = time.perf_counter_ns()

output_ids = model.eagenerate(input_ids, temperature=0.5, max_new_tokens=512)
# output=model.tokenizer.decode(output_ids[0])

# CUDA kernels are launched asynchronously; wait for them to finish before
# stopping the clock so the measurement covers the whole generation.
torch.cuda.synchronize()
finish = time.perf_counter_ns()
elapsed = finish - start
print("Wall Clock Time (ns): ", elapsed)

# len(output_ids) is the size of the tensor's first (batch) dimension, not a
# token count, so count along the sequence dimension instead and subtract the
# prompt length.
# NOTE(review): assumes eagenerate returns the prompt followed by the newly
# generated tokens with shape (batch, seq_len) — confirm against the installed
# EAGLE version.
num_tokens = output_ids.shape[-1] - input_ids.shape[-1]
tokens_per_second = num_tokens / (elapsed / 1e9)  # elapsed is in nanoseconds
print("Tokens Per Second: ", tokens_per_second)

In [None]:
'''

EAGLE:

@inproceedings{li2024eagle, 
	author = {Yuhui Li and Fangyun Wei and Chao Zhang and Hongyang Zhang}, 
	title = {{EAGLE}: Speculative Sampling Requires Rethinking Feature Uncertainty}, 
	booktitle = {International Conference on Machine Learning},
	year = {2024}
}
@inproceedings{li2024eagle2, 
	author = {Yuhui Li and Fangyun Wei and Chao Zhang and Hongyang Zhang}, 
	title = {{EAGLE-2}: Faster Inference of Language Models with Dynamic Draft Trees}, 
	booktitle = {Empirical Methods in Natural Language Processing},
	year = {2024}
}
@misc{li2025eagle3scalinginferenceacceleration,
      title={{EAGLE-3}: Scaling up Inference Acceleration of Large Language Models via Training-Time Test}, 
      author={Yuhui Li and Fangyun Wei and Chao Zhang and Hongyang Zhang},
      year={2025},
      eprint={2503.01840},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2503.01840}, 
}


FlashAttention:

@inproceedings{dao2022flashattention,
  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
  author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
  year={2022}
}
@inproceedings{dao2023flashattention2,
  title={Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
  author={Dao, Tri},
  booktitle={International Conference on Learning Representations (ICLR)},
  year={2024}
}

'''