In [None]:
!pip install accelerate bitsandbytes datasets peft transformers einops

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# The tokenizer is taken from the base "microsoft/phi-2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

# The model weights are read from a Kaggle input directory rather than from
# the base checkpoint (presumably a fine-tuned variant, judging by the
# directory name -- TODO confirm). Device placement and dtype are both left
# to the library's "auto" setting.
model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(
    "/kaggle/input/train-lora-evol-seq-len-2048-r64",
    **model_load_kwargs,
)

In [None]:
from transformers import GenerationConfig, TextStreamer, pipeline

# Start from the generation settings shipped with "microsoft/phi-2", then
# apply the sampling overrides used by this notebook.
generation_config = GenerationConfig.from_pretrained("microsoft/phi-2")
for option_name, option_value in (
    ("max_new_tokens", 512),
    ("temperature", 0.8),
    ("do_sample", True),
):
    setattr(generation_config, option_name, option_value)

# Streams decoded text during generation; configured to leave out the prompt
# and any special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Keyword arguments for the text-generation pipeline. The tokenizer's EOS
# token id is used both as the end-of-sequence id and as the padding id.
pipeline_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "return_full_text": False,
    "generation_config": generation_config,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "streamer": streamer,
}
llm = pipeline("text-generation", **pipeline_kwargs)

In [None]:
from itertools import islice

# Build "samples.jsonl" by concatenating the leading lines of each per-language
# HumanEval-X JSONL file.
output_file_path = "samples.jsonl"

base_path = "/kaggle/input/human-eval-x-repo/codegeex/benchmark/humaneval-x"
languages = ["java", "js", "cpp"]

# Upper bound on the number of lines copied from each language file
# (presumably one benchmark task per line -- TODO confirm).
MAX_LINES_PER_LANGUAGE = 83

jsonl_files = [
    f"{base_path}/{language}/data/humaneval_{language}.jsonl"
    for language in languages
]

# Context managers guarantee that both the output file and the current input
# file are closed even if a read or write raises part-way through.
with open(output_file_path, "w", encoding="utf-8") as output_file:
    for jsonl_file in jsonl_files:
        with open(jsonl_file, "r", encoding="utf-8") as input_file:
            # islice stops after MAX_LINES_PER_LANGUAGE lines or at end of
            # file, whichever comes first.
            for line in islice(input_file, MAX_LINES_PER_LANGUAGE):
                # A final line without a trailing newline would otherwise be
                # fused with the first record of the next file.
                if not line.endswith("\n"):
                    line += "\n"
                output_file.write(line)

In [None]:
import json

# Generate `num_samples_per_task` completions for every task in `file_path`
# and store them as JSON lines ({"task_id", "generation"}) in
# `output_file_path`.
file_path = "samples.jsonl"
output_file_path = "generated_samples.jsonl"
num_samples_per_task = 3
data = []  # every generated record is also kept in memory for later cells

with open(file_path, "r", encoding="utf-8") as file:
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        for line in file:
            # Skip blank lines instead of letting json.loads raise on them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            for _ in range(num_samples_per_task):
                # The pipeline returns a list of results; take the text of
                # the first one.
                answer = llm(json_data['prompt'])[0]['generated_text']
                generated_sample = {
                    "task_id": json_data['task_id'],
                    "generation": answer
                }
                data.append(generated_sample)
                # Write and flush each record immediately so that a failure
                # later in the run does not discard completed generations.
                output_file.write(json.dumps(generated_sample) + '\n')
                output_file.flush()