In [None]:
!pip install accelerate bitsandbytes datasets peft transformers einops

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Tokenizer: taken from the "microsoft/phi-2" hub repository.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

# Model weights: loaded from a local Kaggle input directory, not from the hub.
# NOTE(review): judging by the directory name this is presumably a QLoRA
# fine-tune of Phi-2 -- confirm against the training notebook.
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/train-qlora-alpaca-2048-r64",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)

In [None]:
from transformers import GenerationConfig, TextStreamer, pipeline

# Streams generated text as it is produced, omitting the prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

# Start from the generation settings stored with "microsoft/phi-2",
# then switch on sampling and cap the completion length.
generation_config = GenerationConfig.from_pretrained("microsoft/phi-2")
generation_config.do_sample = True
generation_config.temperature = 0.8
generation_config.max_new_tokens = 512

# Text-generation pipeline used by the later cells as `llm(prompt)`.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    streamer=streamer,
    # Return only the newly generated text, one sequence per prompt.
    return_full_text=False,
    num_return_sequences=1,
    # The EOS token id is reused as the padding id.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [2]:
! git clone https://github.com/openai/human-eval
! pip install -e human-eval
%cd human-eval

Cloning into 'human-eval'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 29 (delta 8), reused 4 (delta 4), pack-reused 9[K
Receiving objects: 100% (29/29), 54.20 KiB | 623.00 KiB/s, done.
Resolving deltas: 100% (9/9), done.
Obtaining file:///kaggle/working/human-eval
  Preparing metadata (setup.py) ... [?25ldone
Collecting fire (from human-eval==1.0)
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25ldone
[?25h  Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116934 sha256=64a8144ba196de972218903bc05285895b8f2b54eee89615ee843df2038baf97
  Stored in directory: /root/.cache/pip/wheels/90/d4/f7/9404e5db0

In [None]:
from human_eval.data import write_jsonl, read_problems

# Benchmark problems, keyed by task id; each entry provides a "prompt".
problems = read_problems()

# Number of completions generated for every problem.
num_samples_per_task = 3

# One record per (task, attempt). The value returned by `llm(...)` is stored
# under "completion" unchanged, exactly as the pipeline returned it.
samples = [
    {"task_id": task_id, "completion": llm(problem["prompt"])}
    for task_id, problem in problems.items()
    for _ in range(num_samples_per_task)
]

write_jsonl("train_samples.jsonl", samples)

In [4]:
import json

def create_jsonl_object(task_id, completion_text):
    """Build one output record.

    Args:
        task_id: Identifier of the task the completion belongs to.
        completion_text: Text stored under the ``completion`` key.

    Returns:
        A dict with the keys ``task_id`` and ``completion``, in that order.
    """
    return dict(task_id=task_id, completion=completion_text)

def process_jsonl_file(input_file_path, output_file_path):
    """Rewrite a JSONL file so every record carries a plain-text completion.

    Each non-blank input line must be a JSON object with a ``task_id`` and a
    ``completion`` that is a list of dicts; the ``generated_text`` of the
    first list element becomes the ``completion`` of the output record.

    Args:
        input_file_path: Path of the JSONL file to read.
        output_file_path: Path of the JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``task_id``, ``completion`` or ``generated_text`` is missing.
        IndexError: If ``completion`` is an empty list.
    """
    # Read and parse the input JSONL file. All records are collected before
    # the output file is opened, so reading finishes before writing starts.
    jsonl_objects = []
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        # Iterate lazily instead of loading every raw line with readlines().
        for line in input_file:
            if not line.strip():
                # Blank or trailing lines would make json.loads raise; skip them.
                continue
            json_line = json.loads(line)
            task_id = json_line['task_id']
            # Assumes 'completion' holds a single element; only the first one is used.
            completion_text = json_line['completion'][0]['generated_text']
            jsonl_objects.append(create_jsonl_object(task_id, completion_text))

    # Write one JSON object per line.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for obj in jsonl_objects:
            output_file.write(json.dumps(obj) + '\n')

# Source: samples file provided as a Kaggle input dataset.
input_file_path = '/kaggle/input/train-samples/train_samples.jsonl'
# Destination: flattened copy inside the human-eval checkout.
output_file_path = '/kaggle/working/human-eval/train_samples_formatted.jsonl'

process_jsonl_file(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)

Nuovo file JSONL salvato in: /kaggle/working/human-eval/train_samples_formatted.jsonl


In [5]:
import json

def process_jsonl(input_file, output_file):
    """Copy a JSONL file, cutting each completion at its ``__main__`` guard.

    For every record, everything from the first occurrence of
    ``if __name__ == "__main__":`` onwards is removed from ``completion``.
    Records without that exact guard are written back unchanged.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``completion`` field.
    """
    main_guard = 'if __name__ == "__main__":'
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip():
                # Blank or trailing lines would make json.loads raise; skip them.
                continue
            data = json.loads(line)
            # Search for the very same marker that is used for slicing. Testing
            # for a shorter marker and then slicing at find() == -1 would
            # silently drop the last character of the completion.
            guard_index = data['completion'].find(main_guard)
            if guard_index != -1:
                data['completion'] = data['completion'][:guard_index]
            json.dump(data, f_out)
            f_out.write('\n')

# Second pass: read the formatted samples and write a copy whose completions
# are cut at the `if __name__ == "__main__":` guard.
input_file = '/kaggle/working/human-eval/train_samples_formatted.jsonl'
output_file = '/kaggle/working/human-eval/train_samples_cleaned.jsonl'

process_jsonl(input_file=input_file, output_file=output_file)

In [None]:
# Overwrite two modules of the cloned human-eval package with the copies
# shipped in the Kaggle input datasets.
# NOTE(review): what these replacement files change is not visible here -- confirm.
! cat /kaggle/input/human-eval-code-repo/human-eval-master/human_eval/execution.py > /kaggle/working/human-eval/human_eval/execution.py
! cat /kaggle/input/eval-file/evaluation.py > /kaggle/working/human-eval/human_eval/evaluation.py 
# Mark the evaluation script as executable.
! chmod +x /kaggle/working/human-eval/human_eval/evaluate_functional_correctness.py 
# Run the evaluation script on the cleaned samples. The script path is relative,
# so the working directory must be the human-eval checkout.
! python human_eval/evaluate_functional_correctness.py /kaggle/working/human-eval/train_samples_cleaned.jsonl