In [1]:
import os
import pandas as pd
import json
import re
import google.generativeai as genai
from dotenv import load_dotenv

In [None]:
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

model = genai.GenerativeModel("models/gemini-1.5-flash")


In [None]:
data = [
    {
        "prompt": "Write a Python function to add two numbers.",
        "completion": "def add(a, b):\n    return a + b"
    },
    {
        "prompt": "Write a Python function to check if a number is even.",
        "completion": "def is_even(n):\n    return n % 2 == 0"
    },
    {
        "prompt": "Write a Python function to find factorial.",
        "completion": "def factorial(n):\n    return 1 if n == 0 else n * factorial(n-1)"
    },
]

df = pd.DataFrame(data)
df.to_csv("code_generation_dataset.csv", index=False)
print("\n📦 Dataset:\n", df.head())


📦 Dataset:
                                               prompt  \
0        Write a Python function to add two numbers.   
1  Write a Python function to check if a number i...   
2         Write a Python function to find factorial.   

                                          completion  
0                   def add(a, b):\n    return a + b  
1             def is_even(n):\n    return n % 2 == 0  
2  def factorial(n):\n    return 1 if n == 0 else...  


In [None]:

def clean_text(prompt, completion):
    prompt = prompt.strip().capitalize()
    completion = completion.strip()

    completion = completion.replace("    ", "\t")

    wrapped_completion = f"```python\n{completion}\n```"

    return prompt, wrapped_completion

def save_jsonl(df, filename):
    with open(filename, "w") as f:
        for _, row in df.iterrows():
            prompt, completion = clean_text(row["prompt"], row["completion"])
            entry = {
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion}
                ]
            }
            f.write(json.dumps(entry) + "\n")

df["prompt"] = df["prompt"].str.strip()
df["completion"] = df["completion"].str.strip()

train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

save_jsonl(train_df, "train.jsonl")
save_jsonl(test_df, "test.jsonl")


In [None]:
def print_jsonl(filename):
    print(f"\n📂 Contents of {filename}:")
    with open(filename, "r") as f:
        for line in f:
            print(line.strip())

print_jsonl("train.jsonl")
print_jsonl("test.jsonl")


📂 Contents of train.jsonl:
{"messages": [{"role": "user", "content": "Write a python function to add two numbers."}, {"role": "assistant", "content": "```python\ndef add(a, b):\n\treturn a + b\n```"}]}
{"messages": [{"role": "user", "content": "Write a python function to check if a number is even."}, {"role": "assistant", "content": "```python\ndef is_even(n):\n\treturn n % 2 == 0\n```"}]}

📂 Contents of test.jsonl:
{"messages": [{"role": "user", "content": "Write a python function to find factorial."}, {"role": "assistant", "content": "```python\ndef factorial(n):\n\treturn 1 if n == 0 else n * factorial(n-1)\n```"}]}
