In [None]:
import pdfplumber
import ollama
import json
from tqdm import tqdm

In [15]:
# 1. Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped. The text of each remaining page is followed by a
    single newline.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        return "".join(content + "\n" for content in extracted if content)

# 2. Split the text into smaller chunks
def split_text(text, max_chunk_length=100):
    """Group the lines of *text* into chunks of roughly *max_chunk_length* characters.

    The text is split on newlines and consecutive lines are accumulated until
    adding the next one would reach ``max_chunk_length`` (each buffered line
    counts as its length plus one newline). A single line that is itself
    longer than the limit is kept whole in its own chunk rather than cut.

    Args:
        text: The full text to split.
        max_chunk_length: Size threshold, in characters, at which the current
            chunk is closed and a new one is started.

    Returns:
        A list of non-empty chunks with surrounding whitespace stripped.
        Empty or whitespace-only input yields an empty list.
    """
    chunks = []
    current_lines = []
    current_length = 0  # buffered characters, counting one newline per line

    for para in text.split("\n"):
        if current_length + len(para) >= max_chunk_length:
            # Close the current chunk, but never emit an empty one: that
            # happens when the very first line already exceeds the limit or
            # when only blank lines have been buffered so far.
            chunk = "\n".join(current_lines).strip()
            if chunk:
                chunks.append(chunk)
            current_lines, current_length = [], 0
        current_lines.append(para)
        current_length += len(para) + 1

    # Flush whatever is left, again skipping whitespace-only remainders.
    chunk = "\n".join(current_lines).strip()
    if chunk:
        chunks.append(chunk)

    return chunks

# 3. Generate instruction and response using Ollama
def generate_instruction_response(chunk, model="llama3"):
    """Ask an Ollama model for one question/answer pair about *chunk*.

    The model is prompted to reply in the form::

        INSTRUCTION: <short question>
        RESPONSE: <short answer>

    and the reply is parsed by splitting on those two markers.

    Args:
        chunk: The source text the question and answer must be based on.
        model: Name of the Ollama model to query.

    Returns:
        A ``(instruction, answer)`` tuple of stripped strings, or
        ``(None, None)`` if either marker is missing from the reply.
    """
    prompt = f"""
You are a data generator for fine-tuning a language model.

Given the content below, create:
1. A short and objective question (instruction), under 10 words, based only on the content.
2. A direct and concise answer (response), under 10 words, based only on the content.

Do not add extra explanations. Use the following format:

INSTRUCTION: <short question>
RESPONSE: <short answer>

Content:
\"\"\"
{chunk}
\"\"\"
"""
    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    content = response["message"]["content"]

    try:
        instruction = content.split("INSTRUCTION:")[1].split("RESPONSE:")[0].strip()
        answer = content.split("RESPONSE:")[1].strip()
    except IndexError:
        # A missing marker makes the [1] lookup fail; that is the only
        # expected parse error. Anything else (including Ctrl-C) propagates.
        print("⚠️ Failed to extract instruction and response.")
        return None, None
    return instruction, answer

# 4. Save to a JSONL file
def save_to_jsonl(pairs, output_file="dataset.json"):
    """Write question/answer pairs to *output_file*, one JSON object per line.

    Each ``(instruction, answer)`` pair becomes a line of the form
    ``{"question": ..., "answer": ...}``. Pairs in which either element is
    falsy are skipped. The file is overwritten and encoded as UTF-8, with
    non-ASCII characters written as-is.
    """
    with open(output_file, "w", encoding="utf-8") as handle:
        for question, reply in pairs:
            if not (question and reply):
                continue
            record = {"question": question, "answer": reply}
            line = json.dumps(record, ensure_ascii=False)
            handle.write(line + "\n")

# Main routine with progress bar
def generate_dataset(pdf_path, model="llama3", output_file="dataset.jsonl", max_chunks=None):
    """Build a question/answer dataset from a PDF and save it as JSON lines.

    The PDF text is extracted, split into chunks, and each chunk is sent to
    the Ollama model to obtain one question/answer pair. Chunks for which no
    pair could be parsed are dropped. Progress is shown with a tqdm bar.

    Args:
        pdf_path: Path of the PDF to read.
        model: Name of the Ollama model used to generate the pairs.
        output_file: Destination path for the generated dataset.
        max_chunks: If not ``None``, only the first ``max_chunks`` chunks are
            processed; ``None`` processes every chunk.
    """
    print("📄 Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    chunks = split_text(text)

    # Compare against None explicitly so that max_chunks=0 means "no chunks"
    # instead of silently falling back to processing the whole document.
    if max_chunks is not None:
        chunks = chunks[:max_chunks]

    print(f"🧠 Generating instruction-response pairs using model: {model}")
    pairs = []
    for chunk in tqdm(chunks, desc="Generating pairs", unit="chunk"):
        instruction, answer = generate_instruction_response(chunk, model)
        if instruction and answer:
            pairs.append((instruction, answer))

    save_to_jsonl(pairs, output_file)
    print(f"\n✅ Dataset saved to: {output_file} ({len(pairs)} examples generated)")

# Usage example
if __name__ == "__main__":
    # Raw string: Windows backslashes are kept literally, so no sequence such
    # as "\d" is parsed as an (invalid) escape.
    pdf_path = (
        r"C:\Users\thomm\OneDrive\Desktop\Repositorios\conecta2ai\TinyGPT-SLM"
        r"\data\t-cross-2025-owners-manual-rhd-uk-australia.pdf"
    )
    # One dataset is generated per chunk limit; add values to produce more sizes.
    for i in [10]:
        generate_dataset(
            pdf_path,
            model="llama3.2",
            output_file=f"dataset_{i}.json",
            max_chunks=i,
        )

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

🧠 Generating instruction-response pairs using model: llama3.2


Generating pairs: 100%|██████████| 10/10 [01:03<00:00,  6.32s/chunk]


✅ Dataset saved to: dataset_10.json (10 examples generated)



