<a href="https://colab.research.google.com/github/donghuna/PromptGenerate/blob/main/generateCode-codeLlama-gsm8k-plan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install datasets



In [15]:
from datetime import datetime
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import csv

In [23]:
# The access token is read from Colab's `userdata` store instead of being
# hardcoded in the notebook.
from google.colab import userdata

from huggingface_hub import login
# Authenticate with the value stored under the 'HUG_TOKEN' key.
login(token=userdata.get('HUG_TOKEN'))

In [17]:
# from google.colab import drive
# drive.mount('/content/drive')


In [18]:
# Single checkpoint id shared by the tokenizer and the generation pipeline.
# (The non-instruct "codellama/CodeLlama-7b-hf" checkpoint was the alternative
# tried previously.)
model_id = "codellama/CodeLlama-7b-Instruct-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, truncation=True)

# NOTE: this rebinds `pipeline`, shadowing the factory function imported from
# transformers in the imports cell. The name is kept because the generation
# cell calls `pipeline(...)`.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Load the question/answer/plan dataset used as the source of problems
# (presumably fetched from the Hugging Face Hub by repo id — confirm).
dataset = load_dataset("donghuna/gsm8k_with_plan")
# Bare expression so the notebook displays the splits, features and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'plan'],
        num_rows: 1000
    })
})

In [20]:
# import textwrap

def generate_and_tokenize_prompt_for_generate_code(question):
    """Build the code-generation prompt for a single math problem.

    Despite the name, nothing is tokenized here: the function only embeds
    ``question`` in a fixed instruction template and returns the text.

    Args:
        question: Problem statement placed under the ``Problem:`` header.

    Returns:
        str: The prompt. It starts with a newline and ends with ``Begin:``
        followed by a newline.
    """
    return (
        "\n"
        "You are a helpful and expert coding assistant. Your task is to solve math-related problems by generating Python code that produces the correct answer. Write clean, efficient, and well-commented Python code to solve the following problem:\n"
        "\n"
        "Problem:\n"
        f"{question}\n"
        "\n"
        "Requirements:\n"
        "- The code should compute the answer directly.\n"
        "- Include comments to explain key steps.\n"
        "- Print the final answer as the output.\n"
        "\n"
        "Provide only the Python code as the response.\n"
        "\n"
        "Begin:\n"
    )

In [21]:
from tqdm import tqdm


output_file = "generated_code-onlyPrompt.csv"

# Upper bound on the number of tokens generated per problem. Using
# `max_new_tokens` keeps this budget independent of the prompt length, so a
# long prompt can no longer shrink the room left for the completion.
MAX_NEW_TOKENS = 400

# Sampling is enabled below; seed the RNGs so a rerun can reproduce the
# generations as far as the backend allows.
transformers.set_seed(42)

with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["question", "answer", "solution"])  # write the CSV header

    for data in tqdm(dataset["train"], desc="Processing Data"):
        question = data["question"]
        answer = data["answer"]

        data_prompt = generate_and_tokenize_prompt_for_generate_code(question)

        sequences = pipeline(
            data_prompt,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            # Reuse EOS as the padding id.
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            max_new_tokens=MAX_NEW_TOKENS,
            # Return only the completion. This replaces splitting the full
            # text on "Begin:", which cut at the wrong place whenever the
            # question itself contained that marker.
            return_full_text=False,
        )

        generated_code = sequences[0]['generated_text'].strip()

        writer.writerow([question, answer, generated_code])
        # The run takes over an hour; flush each row so finished work is on
        # disk if the session is interrupted.
        file.flush()


Processing Data:   0%|          | 0/1000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Processing Data: 100%|██████████| 1000/1000 [1:17:24<00:00,  4.64s/it]


In [24]:
# !cp generated_code-agent.csv /content/drive/MyDrive/

import pandas as pd
from datasets import Dataset
from google.colab import userdata

# Read back the CSV produced by the generation cell.
df = pd.read_csv(output_file)

# NOTE: this rebinds `dataset`, which earlier held the loaded source dataset.
dataset = Dataset.from_pandas(df)

# Publish the generated rows; the token comes from the 'HUG_TOKEN' entry in
# Colab's userdata store. Left as the last expression so the notebook shows
# the returned commit info.
dataset.push_to_hub(
    "donghuna/generated_code-gsm8k-only",
    token=userdata.get('HUG_TOKEN'),
)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/donghuna/generated_code-gsm8k-only/commit/22acfdf3cef97e0a9f899df777fb2c463cc08b4a', commit_message='Upload dataset', commit_description='', oid='22acfdf3cef97e0a9f899df777fb2c463cc08b4a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/donghuna/generated_code-gsm8k-only', endpoint='https://huggingface.co', repo_type='dataset', repo_id='donghuna/generated_code-gsm8k-only'), pr_revision=None, pr_num=None)