In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from modeling_llama import LlamaForCausalLM
from transformers import AutoTokenizer
from decoding import clip_input, infer_input_ids
import numpy as np

In [None]:
# Load the causal LM (project-local LlamaForCausalLM) and its tokenizer from a
# local checkpoint directory, with weights in bfloat16.
model_path = './CodeLlama-13b/'
# Replace nn.Linear's parameter initialisation with a no-op for the whole process.
# NOTE(review): presumably done to skip random init before from_pretrained
# overwrites the weights — confirm; it affects every Linear created afterwards.
torch.nn.Linear.reset_parameters = lambda x: None
model = LlamaForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Move the model to the hard-coded CUDA device (index 2) and switch it to eval mode.
device='cuda:2'
model = model.to(device).eval()

In [None]:
# Fetch and print the two collections returned by model.get_skip_layers().
# NOTE(review): by their names these look like the attention / MLP layer ids
# skipped while drafting — confirm against modeling_llama.
_attn_skip_layer_id_set, _mlp_skip_layer_id_set =model.get_skip_layers()
print(_attn_skip_layer_id_set, _mlp_skip_layer_id_set)

In [None]:
# Seed the torch and numpy global RNGs; `seed` is also reused below when
# shuffling the summarisation datasets.
seed=42
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
from human_eval.data import write_jsonl
from tqdm import tqdm
def filter_code(completion: str) -> str:
    """Return the first blank-line-delimited chunk of a generated completion.

    Leading newlines are dropped first; everything from the first empty line
    onward is discarded, because the model tends to keep writing further
    functions after the one that was asked for.
    """
    trimmed = completion.lstrip("\n")
    first_chunk, _, _ = trimmed.partition("\n\n")
    return first_chunk

def fix_indents(text: str) -> str:
    """Replace every tab character in ``text`` with four spaces."""
    four_spaces = " " * 4
    return four_spaces.join(text.split("\t"))

In [None]:
from datasets import load_dataset
from human_eval.data import read_problems

# Task configuration: choose the benchmark and how many few-shot examples to
# prepend. Few-shot examples are only defined for the summarisation tasks.
n_shot = 0
task_name = 'humaneval'
prompt_shots = ''
if task_name == 'xsum':
    # 1000 shuffled test documents; few-shot examples come from the train split.
    data = load_dataset('xsum', split='test').shuffle(seed=seed).select(range(1000))
    shots = load_dataset('xsum',split='train').shuffle(seed=seed).select(range(n_shot))
    prompt_keys=['document','summary']
elif task_name == 'cnndm':
    data = load_dataset('cnn_dailymail', name='3.0.0', split='test') .shuffle(seed=seed).select(range(1000))
    shots = load_dataset('cnn_dailymail', name='3.0.0', split='train').shuffle(seed=seed).select(range(n_shot))
    prompt_keys=['article','highlights']
elif task_name == 'humaneval':
    # Read through the human_eval package; loading HumanEval via HF datasets was problematic.
    data = read_problems()
    # This branch defines neither `shots` nor `prompt_keys`, so the few-shot loop
    # below would hit a NameError; reject the combination up front instead.
    if n_shot > 0:
        raise ValueError("few-shot prompts (n_shot > 0) are not supported for task 'humaneval'")
else:
    # Without this, `data` would stay undefined and a later cell would fail instead.
    raise ValueError(f"unknown task_name: {task_name!r}")

# Build the few-shot prefix in 'Article: ...\nSummary: ...\n' form
# (newlines inside the summary are removed).
for i in range(n_shot):
    prompt = 'Article: ' + shots[i][prompt_keys[0]] + '\nSummary: ' + shots[i][prompt_keys[1]].replace('\n', '') + '\n'
    prompt_shots += prompt

In [None]:
print(prompt_shots)

In [None]:

# Benchmark over every task in `data`: plain sampling ('base') versus
# self-speculative decoding ('sss') with a fixed draft-stop threshold and two
# auto-threshold configurations whose third auto_parameters entry is 0.92
# (autoth1) and 0.95 (autoth2).
# NOTE(review): judging by the output file names, auto_parameters is
# [st, mom1, mat, var, mom2] — confirm against decoding.infer_input_ids.
#
# Outputs: a running-metrics text log (one line per sample) and, at the end,
# one JSONL file of completions per variant.
main_metrics = {'time_base':[], 'time_sss':[], 'time_sss_autoth1':[], 'time_sss_autoth2':[],
                'token_time_base':[], 'token_time_sss':[], 'token_time_sss_autoth1':[], 'token_time_sss_autoth2':[],
                'matchness_sss':[],'num_drafted_tokens_sss':[],
                'matchness_sss_autoth1':[],'num_drafted_tokens_sss_autoth1':[],
                'matchness_sss_autoth2':[],'num_drafted_tokens_sss_autoth2':[],
                }
num_samples_per_task = 10
pbar = tqdm(total=len(data) * num_samples_per_task)
out_path = {'base': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_base-k0-p0.95-t0.6_rth1.00_double_check2.jsonl',
            'sss':  './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_th0.6_double_check2.jsonl',
            'sss_autoth1':  './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_autoth0.6-st1-mat0.92-var1e-2--mom1-0.5-mom2-0.9_double_check2.jsonl',
            'sss_autoth2':  './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_autoth0.6-st1-mat0.95-var1e-2--mom1-0.5-mom2-0.9_double_check2.jsonl',
            }
samples = {'base': [], 'sss': [], 'sss_autoth1': [], 'sss_autoth2': []}
with open('HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_autoth0.6-st1-mat0.92-0.95-var1e-2-mom1-0.5-mom2-0.9.txt', 'w') as f:
    for i,task_id in enumerate(data):
        input_ids = clip_input(tokenizer, data[task_id], task_name)
        f.write(f'prompt length: {input_ids.shape[1]} \n')
        for n in range(num_samples_per_task):
            # The very first run starts every variant at 0.6; each later run starts
            # from the threshold returned by the previous run of the same variant.
            if i == 0 and n == 0:
                th_stop_draft_sss = 0.6
                th_stop_draft_sss_autoth1  = 0.6
                th_stop_draft_sss_autoth2  = 0.6
            else:
                th_stop_draft_sss = result_sss['th_stop_draft']
                th_stop_draft_sss_autoth1 = result_sss_autoth1['th_stop_draft']
                th_stop_draft_sss_autoth2 = result_sss_autoth2['th_stop_draft']
            f.write('sss th: {:.4f}, sss autoth1: {:.4f}, sss autoth2: {:.4f} \n'.format(th_stop_draft_sss, th_stop_draft_sss_autoth1, th_stop_draft_sss_autoth2))
            # All variants share the prompt, the per-sample seed (n) and the
            # sampling settings (top_p=0.95, temperature=0.6, top_k=0).
            result_base = infer_input_ids(model, tokenizer, input_ids, generate_fn='base',seed=n,
                        max_new_tokens=512, do_sample=True, early_stop=True,
                        top_k=0, top_p=0.95, temperature=0.6)
            result_sss = infer_input_ids(model, tokenizer, input_ids, generate_fn='sss', seed=n,
                        max_new_tokens=512, early_stop=True, max_step_draft=12,
                        th_stop_draft=th_stop_draft_sss, th_random_draft=1.0, auto_th_stop_draft=False,
                        do_sample=True, do_sample_draft=True, top_k=0, top_p=0.95, temperature=0.6)
            result_sss_autoth1 = infer_input_ids(model, tokenizer, input_ids, generate_fn='sss', seed=n,
                        max_new_tokens=512, early_stop=True,max_step_draft=12,
                        th_stop_draft=th_stop_draft_sss_autoth1, th_random_draft=1.0, auto_th_stop_draft=True, auto_parameters=[1,0.5,0.92,1e-2,0.9],
                        do_sample=True, do_sample_draft=True, top_k=0, top_p=0.95, temperature=0.6)
            result_sss_autoth2 = infer_input_ids(model, tokenizer, input_ids, generate_fn='sss', seed=n,
                        max_new_tokens=512, early_stop=True,max_step_draft=12,
                        th_stop_draft=th_stop_draft_sss_autoth2, th_random_draft=1.0, auto_th_stop_draft=True, auto_parameters=[1,0.5,0.95,1e-2,0.9],
                        do_sample=True, do_sample_draft=True, top_k=0, top_p=0.95, temperature=0.6)

            results = [
                ('base', result_base),
                ('sss', result_sss),
                ('sss_autoth1', result_sss_autoth1),
                ('sss_autoth2', result_sss_autoth2),
            ]

            for key, result in results:
                main_metrics['time_' + key].append(result['time'])
                # Per-token time: run time divided by the second dimension of generate_ids.
                main_metrics['token_time_' + key].append(result['time'] / result['generate_ids'].shape[1])

                # Draft statistics are only collected for the speculative variants.
                if key != 'base':
                    main_metrics['matchness_' + key].append(result['matchness'])
                    main_metrics['num_drafted_tokens_' + key].append(result['num_drafted_tokens'])
                sample=filter_code(fix_indents(result['completion']))
                samples[key]+= [{'task_id': task_id, 'completion': sample}]

            # Running means over all samples so far. The 'mat' value in each label is
            # the third auto_parameters entry of the corresponding run above
            # (0.92 for autoth1, 0.95 for autoth2).
            metric = {
                'mean time base tem 0.6':np.mean(main_metrics['time_base']),
                'mean time sss tem 0.6':np.mean(main_metrics['time_sss']),
                'mean time sss autoth rth 1.00 tem 0.6 mat 0.92':np.mean(main_metrics['time_sss_autoth1']),
                'mean time sss autoth rth 1.00 tem 0.6 mat 0.95':np.mean(main_metrics['time_sss_autoth2']),
                'E2E mean speed up sss tem 0.6':np.mean(main_metrics['time_base'])/np.mean(main_metrics['time_sss']),
                'E2E mean speed up sss autoth rth 1.00 tem 0.6 mat 0.92':np.mean(main_metrics['time_base'])/np.mean(main_metrics['time_sss_autoth1']),
                'E2E mean speed up sss autoth rth 1.00 tem 0.6 mat 0.95':np.mean(main_metrics['time_base'])/np.mean(main_metrics['time_sss_autoth2']),
                'E2E mean token speed up sss tem 0.6':np.mean(main_metrics['token_time_base'])/np.mean(main_metrics['token_time_sss']),
                'E2E mean token speed up sss autoth rth 1.00 tem 0.6 mat 0.92':np.mean(main_metrics['token_time_base'])/np.mean(main_metrics['token_time_sss_autoth1']),
                'E2E mean token speed up sss autoth rth 1.00 tem 0.6 mat 0.95':np.mean(main_metrics['token_time_base'])/np.mean(main_metrics['token_time_sss_autoth2']),
                'mean matchness sss tem 0.6':np.mean(main_metrics['matchness_sss']),
                'mean matchness sss autoth rth 1.00 tem 0.6 mat 0.92':np.mean(main_metrics['matchness_sss_autoth1']),
                'mean matchness sss autoth rth 1.00 tem 0.6 mat 0.95':np.mean(main_metrics['matchness_sss_autoth2']),
                'mean num_drafted_tokens sss tem 0.6':np.mean(main_metrics['num_drafted_tokens_sss']),
                'mean num_drafted_tokens sss autoth rth 1.00 tem 0.6 mat 0.92':np.mean(main_metrics['num_drafted_tokens_sss_autoth1']),
                'mean num_drafted_tokens sss autoth rth 1.00 tem 0.6 mat 0.95':np.mean(main_metrics['num_drafted_tokens_sss_autoth2']),
            }
            # Render float values with four decimals for the log line.
            for key, value in metric.items():
                if isinstance(value, float):
                    metric[key] = f"{value:.4f}"
            f.write(f'{task_id},sample{n},{metric} \n')
            # Flush after every sample so the log can be followed during the run.
            f.flush()
        pbar.update(num_samples_per_task)
    # Completions are written only once, after all tasks have finished.
    for key in out_path.keys():
        write_jsonl(out_path[key], samples[key])
pbar.close()

In [None]:

# Benchmark sweep: baseline sampling ('base') versus self-speculative decoding
# ('sss') with a fixed draft-stop threshold and three auto-threshold
# configurations. Writes a running-metrics text log (one line per sample) and,
# at the end, one JSONL file of completions per variant.
variants = ['base', 'sss', 'sss_autoth1', 'sss_autoth2', 'sss_autoth3']
drafted = variants[1:]  # every variant except the baseline records draft statistics

main_metrics = {}
for prefix in ('time_', 'token_time_'):
    for name in variants:
        main_metrics[prefix + name] = []
for name in drafted:
    main_metrics['matchness_' + name] = []
    main_metrics['num_drafted_tokens_' + name] = []

num_samples_per_task = 10
pbar = tqdm(total=len(data) * num_samples_per_task)
out_path = {
    'base': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_base-k0-p0.95-t0.6_rth1.00.jsonl',
    'sss': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_th0.6.jsonl',
    'sss_autoth1': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth0.95_autoth0.6-st1-mat0.85-var1e-2--mom1-0.5-mom2-0.9.jsonl',
    'sss_autoth2': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_autoth0.6-st1-mat0.85-var1e-2--mom1-0.5-mom2-0.9_v2.jsonl',
    'sss_autoth3': './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth1.00_autoth0.6-st1-mat0.90-var1e-2--mom1-0.5-mom2-0.9_v2.jsonl',
}
samples = {name: [] for name in variants}

# Human-readable label fragment of each variant, used to build the metric keys.
labels = {
    'base': 'base tem 0.6',
    'sss': 'sss tem 0.6',
    'sss_autoth1': 'sss autoth rth 0.95 tem 0.6 mat 0.85',
    'sss_autoth2': 'sss autoth rth 1.00 tem 0.6 mat 0.85',
    'sss_autoth3': 'sss autoth rth 1.00 tem 0.6 mat 0.90',
}
# Generation settings shared by every variant.
sampling = dict(max_new_tokens=512, early_stop=True, do_sample=True,
                top_k=0, top_p=0.95, temperature=0.6)

log_name = 'HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_sss-k0-p0.95-t0.6_rth0.95-1.00_autoth0.6-st1-mat0.85-0.90-var1e-2-mom1-0.5-mom2-0.9.txt'
previous = None  # results of the preceding sample; None only before the very first run
with open(log_name, 'w') as f:
    for task_id in data:
        input_ids = clip_input(tokenizer, data[task_id], task_name)
        f.write(f'prompt length: {input_ids.shape[1]} \n')
        for n in range(num_samples_per_task):
            # First run starts at 0.6; afterwards each variant continues from the
            # threshold its own previous run returned.
            if previous is None:
                thresholds = {name: 0.6 for name in drafted}
            else:
                thresholds = {name: previous[name]['th_stop_draft'] for name in drafted}
            f.write('sss th: {:.4f}, sss autoth1: {:.4f}, sss autoth2: {:.4f}, sss autoth3: {:.4f} \n'.format(
                *(thresholds[name] for name in drafted)))

            # Rebuilt on every sample so each call receives fresh argument objects.
            overrides = {
                'sss': dict(th_random_draft=1.0, auto_th_stop_draft=False),
                'sss_autoth1': dict(th_random_draft=0.95, auto_th_stop_draft=True,
                                    auto_parameters=[1, 0.5, 0.85, 1e-2, 0.9]),
                'sss_autoth2': dict(th_random_draft=1.0, auto_th_stop_draft=True,
                                    auto_parameters=[1, 0.5, 0.85, 1e-2, 0.9]),
                'sss_autoth3': dict(th_random_draft=1.0, auto_th_stop_draft=True,
                                    auto_parameters=[1, 0.5, 0.90, 1e-2, 0.9]),
            }

            # Run the baseline first, then the speculative variants in declaration order.
            outputs = {'base': infer_input_ids(model, tokenizer, input_ids,
                                               generate_fn='base', seed=n, **sampling)}
            for name in drafted:
                outputs[name] = infer_input_ids(model, tokenizer, input_ids,
                                                generate_fn='sss', seed=n,
                                                max_step_draft=12, do_sample_draft=True,
                                                th_stop_draft=thresholds[name],
                                                **sampling, **overrides[name])

            for name, output in outputs.items():
                elapsed = output['time']
                main_metrics['time_' + name].append(elapsed)
                main_metrics['token_time_' + name].append(elapsed / output['generate_ids'].shape[1])
                if name in drafted:
                    main_metrics['matchness_' + name].append(output['matchness'])
                    main_metrics['num_drafted_tokens_' + name].append(output['num_drafted_tokens'])
                cleaned = filter_code(fix_indents(output['completion']))
                samples[name].append({'task_id': task_id, 'completion': cleaned})

            # Running means over every sample so far, grouped by metric kind so the
            # key order of the logged dict stays: times, speed-ups, token speed-ups,
            # matchness, drafted-token counts.
            metric = {}
            for name in variants:
                metric['mean time ' + labels[name]] = np.mean(main_metrics['time_' + name])
            for name in drafted:
                metric['E2E mean speed up ' + labels[name]] = (
                    np.mean(main_metrics['time_base']) / np.mean(main_metrics['time_' + name]))
            for name in drafted:
                metric['E2E mean token speed up ' + labels[name]] = (
                    np.mean(main_metrics['token_time_base']) / np.mean(main_metrics['token_time_' + name]))
            for name in drafted:
                metric['mean matchness ' + labels[name]] = np.mean(main_metrics['matchness_' + name])
            for name in drafted:
                metric['mean num_drafted_tokens ' + labels[name]] = np.mean(main_metrics['num_drafted_tokens_' + name])
            # Float values are rendered with four decimals; anything else is logged as-is.
            metric = {label: (f"{value:.4f}" if isinstance(value, float) else value)
                      for label, value in metric.items()}

            f.write(f'{task_id},sample{n},{metric} \n')
            f.flush()
            previous = outputs
        pbar.update(num_samples_per_task)
    # Completions are written once, after all tasks have finished.
    for name in out_path:
        write_jsonl(out_path[name], samples[name])

In [None]:
import json

# Post-process a completions JSONL file: prefix every record's "completion"
# with four spaces and write the result to a second JSONL file.
# NOTE(review): presumably restores the function-body indentation of the first
# generated line before evaluation — confirm.
results_name = './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_base-k0-p0.95-t0.6_rth1.00_double_check.jsonl'
results_name2 = './HumanEval_13b-code_0shot_sample10_bayesian_maxtoken512_maxstep12_base-k0-p0.95-t0.6_rth1.00_double_check_output.jsonl'

input_filename = results_name
output_filename = results_name2

with open(input_filename, 'r') as input_file, open(output_filename, 'w') as output_file:
    for line in input_file:
        # Named `record` rather than `data` so the global `data` (the loaded
        # problems the benchmark cells iterate over) is not overwritten.
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Report the unparsable line and carry on with the rest of the file.
            print(f"Error decoding JSON: {line}")
            continue
        record["completion"] = "    " + record.get("completion", "")
        output_file.write(json.dumps(record) + '\n')



In [None]:
git clone https://github.com/openai/human-eval
pip install -e human-eval
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!evaluate_functional_correctness result_name2