# **t5-small**

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset

# -------------------------------
# 1. Load dataset (from JSONL)
# -------------------------------
# The JSONL file loaded below must provide the two fields that preprocess() reads:
# "prompt" (the natural-language request) and "params" (the target text), e.g.
# {"prompt": "Design a cylinder ...", "params": "{\"cyl_id\": 100, \"cyl_len\": 200, \"P\": 1.0}"}
# NOTE(review): the generator cell in this notebook writes cadquery_dataset.jsonl with
# "prompt"/"completion" keys, which would not satisfy preprocess() — confirm which file is intended.

dataset = load_dataset("json", data_files="/content/cadquery_dataset.jsonl", split="train")

# -------------------------------
# 2. Tokenizer + model
# -------------------------------
model_name = "t5-small"   # small & cheap for prototyping
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of prompt/params pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with a "prompt" list (model inputs) and a "params" list
            (target strings), as supplied by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenized prompts with a "labels" entry added. Both sides are padded or
        truncated to 64 tokens; padding positions in the labels are set to -100.
    """
    # Encode natural-language prompt as input
    inputs = tokenizer(batch["prompt"], padding="max_length", truncation=True, max_length=64)

    # Encode JSON params as output
    labels = tokenizer(batch["params"], padding="max_length", truncation=True, max_length=64)

    # -100 is the label index ignored by the loss. Without this replacement most of
    # the 64 label positions are padding and the model is trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return inputs

# Tokenize the whole dataset once, in batches
dataset = dataset.map(preprocess, batched=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# -------------------------------
# 3. Training setup
# -------------------------------
# No evaluation split is configured; a checkpoint is written at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./cadquery_llm",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    logging_steps=50,
    eval_strategy="no",
    save_strategy="epoch",
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# -------------------------------
# 4. Train + Save model
# -------------------------------
trainer.train()

# Final weights and tokenizer go into the same folder so it can be reloaded with from_pretrained()
model.save_pretrained("./cadquery_llm")
tokenizer.save_pretrained("./cadquery_llm")

print("✅ Fine-tuned model saved to ./cadquery_llm")

KeyboardInterrupt: 

# **prompt (text to .step)**

In [None]:
import cadquery as cq
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import re

# Load fine-tuned model
# Reads the folder written by the t5-small training cell; this rebinds the
# notebook-level `model_name`, `tokenizer` and `model` names.
model_name = "./cadquery_llm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def prompt_to_params(prompt):
    """Run the fine-tuned model on `prompt` and parse its output as JSON.

    The decoded text is tried as-is first and then wrapped in curly braces,
    because the model may emit the key/value pairs without the enclosing braces.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If neither form of the output is valid JSON.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    generated = model.generate(**encoded, max_length=64)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True).strip()

    print(f"Model raw output: {decoded}")  # debugging aid

    # Candidate 1: the raw text; candidate 2: the same text wrapped in braces
    for candidate in (decoded, "{" + decoded + "}"):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    raise ValueError("Could not parse model output as JSON")


def build_cylinder(params):
    """Build a hollow cylinder from the parsed design parameters.

    Args:
        params: Mapping with "P", "cyl_id" and "cyl_len" entries.
            NOTE(review): the prompts suggest MPa and millimetres — confirm units.

    Returns:
        The CadQuery workplane holding the extruded ring (outer circle minus bore).
    """
    yield_strength = 205
    fos = 5
    pressure, bore, length = params["P"], params["cyl_id"], params["cyl_len"]

    # Wall thickness from the hoop-stress expression plus a 1-unit allowance, rounded to an integer
    wall = round(fos * (pressure * bore) / (2 * yield_strength) + 1)
    outer_dia = bore + 2 * wall

    sketch = cq.Workplane("XY").circle(outer_dia / 2).circle(bore / 2)
    return sketch.extrude(length)

# Usage - Simplified prompt
# End-to-end smoke run: prompt -> model-predicted parameters -> CadQuery solid -> STEP file.
prompt = "Design a cylinder with bore 120 mm, length 250 mm, operating at 1.2 MPa"
params = prompt_to_params(prompt)
print("Params:", params)

# The STEP file is written to the current working directory
model_cad = build_cylinder(params)
cq.exporters.export(model_cad, "generated_cylinder.step")
print("Exported STEP file!")

Model raw output: "cyl_id": 120, "cyl_len": 250, "P": 1.2
Params: {'cyl_id': 120, 'cyl_len': 250, 'P': 1.2}
Exported STEP file!


# **user input prompt**

In [None]:
import cadquery as cq
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json
import os
import re

# -------------------------------
# Load fine-tuned model
# -------------------------------
# Reads the folder written by the t5-small training cell.
model_name = "./cadquery_llm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# JSONL file that log_training_sample() appends accepted prompt/params pairs to
DATASET_FILE = "cadquery_train.jsonl"

# -------------------------------
# Convert text → params
# -------------------------------
def _parse_cylinder_params(decoded):
    """Extract numeric cyl_id / cyl_len / P values from raw model text, skipping unparsable ones."""
    parsed = {}
    for key, raw_value in re.findall(r'"(cyl_id|cyl_len|P)":\s*(.*?)(?:,\s*|$)', decoded):
        if key in parsed:
            # Keep the first usable value: repeated pairs near the end of the
            # generation can be cut off by the length limit (e.g. `"P": 1.`).
            continue

        # Drop surrounding whitespace and quotes; units or trailing junk are ignored by the number search
        text = raw_value.strip().strip('"')
        if key == "P":
            # Pressure is a float; also accept a bare fractional form such as ".5"
            number = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
            if number:
                parsed[key] = float(number.group(0))
        else:
            # Dimensions are integers; only the leading digit run is used
            number = re.search(r'\d+', text)
            if number:
                parsed[key] = int(number.group(0))

        if key not in parsed:
            print(f"Warning: Could not convert value '{text}' for key '{key}' to a number. Ignoring it.")
    return parsed


def prompt_to_params(prompt):
    """Run the fine-tuned model on `prompt` and return numeric cylinder parameters.

    Args:
        prompt: Natural-language design request.

    Returns:
        dict with "cyl_id" (int), "cyl_len" (int) and "P" (float).

    Raises:
        ValueError: If any of the three parameters is missing from the model output
            or has no numeric value. Non-numeric values are never returned, so
            downstream arithmetic cannot receive strings.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=64)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    print(f"Model raw output: {decoded}")  # debugging aid

    cleaned_output = _parse_cylinder_params(decoded)

    expected_keys = ["cyl_id", "cyl_len", "P"]
    if not all(key in cleaned_output for key in expected_keys):
        raise ValueError(f"Model output is missing expected parameters. Found: {cleaned_output}. Expected: {expected_keys}")

    return cleaned_output


# -------------------------------
# CadQuery cylinder builder
# -------------------------------
def build_cylinder(params):
    """Build a hollow cylinder from validated design parameters.

    Args:
        params: Mapping with "P", "cyl_id" and "cyl_len" entries.
            NOTE(review): the prompts suggest MPa and millimetres — confirm units.

    Returns:
        The CadQuery workplane holding the extruded ring (outer circle minus bore).

    Raises:
        KeyError: If one of the three entries is absent.
        ValueError: If an entry is not a number, a dimension is not positive,
            or the pressure is negative.
    """
    yield_strength = 205
    fos = 5

    # Validate every value that takes part in the arithmetic, not only the pressure
    for name in ("P", "cyl_id", "cyl_len"):
        value = params[name]
        if not isinstance(value, (int, float)):
            if name == "P":
                raise ValueError(f"Pressure value is not a number: {params['P']}")
            raise ValueError(f"Parameter '{name}' is not a number: {value!r}")

    P = params["P"]
    cyl_id = params["cyl_id"]
    cyl_len = params["cyl_len"]

    if P < 0:
        raise ValueError(f"Pressure must not be negative: {P}")
    if cyl_id <= 0 or cyl_len <= 0:
        raise ValueError(f"Cylinder dimensions must be positive: cyl_id={cyl_id}, cyl_len={cyl_len}")

    # Wall thickness from the hoop-stress expression plus a 1-unit allowance, rounded to an integer
    cyl_thk = round(fos * (P * cyl_id) / (2 * yield_strength) + 1)
    cyl_od = cyl_id + 2 * cyl_thk

    cylinder = (
        cq.Workplane("XY")
        .circle(cyl_od / 2)
        .circle(cyl_id / 2)
        .extrude(cyl_len)
    )
    return cylinder

# -------------------------------
# Append training sample to dataset
# -------------------------------
def log_training_sample(prompt: str, params: dict, filename=DATASET_FILE):
    """Append one prompt/params pair to the JSONL training log.

    The params dict is stored as a JSON-encoded string inside the record, so each
    line has the shape {"prompt": <str>, "params": <JSON string>}.
    """
    record = json.dumps({"prompt": prompt, "params": json.dumps(params)})
    with open(filename, "a") as log_file:
        log_file.write(record + "\n")
    print(f"📂 Logged sample → {filename}")

# -------------------------------
# Interactive loop
# -------------------------------
if __name__ == "__main__":
    # Make sure the log file exists (created empty) before the first append
    if not os.path.exists(DATASET_FILE):
        with open(DATASET_FILE, "w") as f:
            pass

    while True:
        # Interrupting the kernel or closing stdin while waiting for input ends
        # the session cleanly instead of surfacing a traceback.
        try:
            prompt = input("\nEnter a design prompt (or 'quit'): ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nStopping.")
            break

        if prompt.lower() in ["quit", "exit"]:
            break
        if not prompt:
            # Nothing to send to the model
            continue

        try:
            # LLM → params
            params = prompt_to_params(prompt)
            print("🔧 Params:", params)

            # Build CAD model
            model_cad = build_cylinder(params)
            cq.exporters.export(model_cad, "generated.step")
            print("✅ Exported STEP file: generated.step")

            # Save training data only after the model built and exported successfully
            log_training_sample(prompt, params)

        except ValueError as ve:
            print(f"⚠️ Error parsing or processing parameters: {ve}")
        except Exception as e:
            print(f"⚠️ An unexpected error occurred: {e}")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-2703667964.py", line 112, in <cell line: 0>
    prompt = input("\nEnter a design prompt (or 'quit'): ")
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 1177, in raw_input
    return self._input_request(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 1219, in _input_request
    raise KeyboardInterrupt("Interrupted by user") from None
KeyboardInterrupt: Interrupted by user

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback

TypeError: object of type 'NoneType' has no len()

# **generate json data**

In [None]:
import json
import random

def generate_prompt_and_code():
    """Create one random training pair: a cylinder prompt and its CadQuery script.

    Bore, stroke and pressure are drawn at random; the prompt uses one of four
    phrasings (two quote the pressure in bar, two in MPa) while the script
    always carries the pressure in MPa.

    Returns:
        dict with "prompt" (str) and "completion" (CadQuery source, str).
    """
    # Random parameters (draw order matters for seeded reproducibility)
    bore = random.randint(50, 200)        # mm
    stroke = random.randint(50, 500)      # mm
    pressure_bar = random.randint(2, 20)  # bar
    pressure_mpa = round(pressure_bar * 0.1, 2)  # 1 bar = 0.1 MPa

    # Random wording variety
    phrasings = (
        "Generate a cylinder with bore {bore} mm, stroke length {stroke} mm, and pressure {bar} bar.",
        "Design a hydraulic cylinder with ID {bore} mm, length {stroke} mm, and pressure {mpa} MPa.",
        "Make a cylinder: bore {bore} mm, stroke {stroke} mm, working pressure {bar} bar.",
        "Create a cylinder model where the internal diameter is {bore} mm, stroke {stroke} mm, pressure {mpa} MPa.",
    )
    prompt = random.choice(phrasings).format(
        bore=bore, stroke=stroke, bar=pressure_bar, mpa=pressure_mpa
    )

    # CadQuery code (expected output), assembled line by line
    code_lines = [
        "import cadquery as cq",
        "",
        "yield_strength = 205  # MPa",
        "fos = 5",
        f"P = {pressure_mpa}  # MPa",
        f"cyl_id = {bore}",
        f"cyl_len = {stroke}",
        "cyl_thk = round(fos * (P * cyl_id) / (2 * yield_strength) + 1)",
        "cyl_od = cyl_id + 2 * cyl_thk",
        "",
        "model = (",
        '    cq.Workplane("XY")',
        "    .circle(cyl_od / 2)",
        "    .circle(cyl_id / 2)",
        "    .extrude(cyl_len)",
        ")",
    ]
    code = "\n".join(code_lines) + "\n"

    return {"prompt": prompt, "completion": code}


def generate_dataset(n=5000, filename="cadquery_dataset.jsonl"):
    """Write `n` random prompt/completion samples to `filename`, one JSON object per line."""
    with open(filename, "w") as out:
        out.writelines(
            json.dumps(generate_prompt_and_code()) + "\n" for _ in range(n)
        )
    print(f"✅ Dataset saved to {filename} with {n} samples.")


# Generate 5000 training samples
# Uses the default filename, so this (re)writes cadquery_dataset.jsonl in the working directory.
generate_dataset(5000)


✅ Dataset saved to cadquery_dataset.jsonl with 5000 samples.


# **json + csv**

In [None]:
import random
import json
import csv

def generate_cylinder_dataset(n_samples=5000, jsonl_file="cylinder_dataset.jsonl", csv_file="cylinder_dataset.csv"):
    """Generate random cylinder design samples and save them as JSONL and CSV.

    Every derived quantity is computed from the rounded pressure that is shown in
    the prompt, so the prompt -> code mapping is deterministic: the same prompt
    text always yields the same P, thickness and outer diameter.

    Args:
        n_samples: Number of samples to generate.
        jsonl_file: Output path; one JSON object per line with "prompt",
            "params" (object) and "code" (CadQuery source).
        csv_file: Output path; same three columns, with "params" stored as a JSON string.
    """
    # Material constants
    yield_strength = 205  # MPa
    fos = 5

    dataset = []

    for _ in range(n_samples):
        # Randomize parameters
        cyl_id = random.randint(40, 200)                  # mm
        cyl_len = random.randint(50, 300)                 # mm
        pressure_bar = round(random.uniform(2, 20), 2)    # bar, as quoted in the prompt
        pressure_mpa = round(pressure_bar * 0.1, 3)       # convert bar → MPa

        # Calculate thickness (thin cylinder hoop stress formula) from the same
        # rounded pressure that is written into the generated code.
        cyl_thk = round(fos * (pressure_mpa * cyl_id) / (2 * yield_strength) + 1, 2)
        # Rounded to strip binary floating-point noise from the sum
        cyl_od = round(cyl_id + 2 * cyl_thk, 2)

        # Prompt (natural language)
        prompt = f"design a cylinder with id {cyl_id} mm, stroke length {cyl_len} mm, pressure {pressure_bar} bar"

        # Params dict
        params = {
            "id_mm": cyl_id,
            "stroke_mm": cyl_len,
            "pressure_bar": pressure_bar,
            "pressure_mpa": pressure_mpa,
            "fos": fos,
            "yield_strength_mpa": yield_strength,
            "thickness_mm": cyl_thk,
            "od_mm": cyl_od
        }

        # CadQuery code string
        code = f"""
import cadquery as cq

yield_strength = {yield_strength}  # MPa
fos = {fos}
P = {pressure_mpa}  # MPa
cyl_id = {cyl_id}
cyl_len = {cyl_len}
cyl_thk = {cyl_thk}
cyl_od = cyl_id + 2 * cyl_thk

model = (
    cq.Workplane("XY")
    .circle(cyl_od/2)
    .circle(cyl_id/2)
    .extrude(cyl_len)
)
"""

        dataset.append({"prompt": prompt, "params": params, "code": code.strip()})

    # Save JSONL
    with open(jsonl_file, "w") as f:
        for entry in dataset:
            f.write(json.dumps(entry) + "\n")

    # Save CSV; params is serialized as JSON so the column can be parsed back
    # (the default str() of a dict is not valid JSON).
    with open(csv_file, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["prompt", "params", "code"])
        writer.writeheader()
        for entry in dataset:
            writer.writerow({**entry, "params": json.dumps(entry["params"])})

    print(f"✅ Dataset generated with {n_samples} samples")
    print(f"   JSONL file: {jsonl_file}")
    print(f"   CSV file: {csv_file}")


# Run
# Uses the defaults: 5000 samples written to cylinder_dataset.jsonl and cylinder_dataset.csv.
generate_cylinder_dataset()


✅ Dataset generated with 5000 samples
   JSONL file: cylinder_dataset.jsonl
   CSV file: cylinder_dataset.csv


# **train llm on generated dataset (gpt2)**


In [None]:

import os
import json
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

# CONFIG
DATA_JSON = "cylinder_dataset.jsonl"  # JSON Lines file (one record per line) read via load_dataset("json", ...)
BASE_MODEL = "gpt2"
OUT_DIR = "./cad_llm"
EPOCHS = 3
BATCH = 2
MAX_LENGTH = 1024   # context length for gpt2
LR = 5e-5
SEED = 42

def make_text(example):
    """Join a record's prompt and code into one training string.

    The result has the form ``<prompt>\\n### CADQUERY CODE\\n<code>\\n``. A field
    that is absent or null (None) is treated as an empty string.

    Args:
        example: Mapping that may contain "prompt" and "code" entries.

    Returns:
        dict with a single "text" entry.
    """
    # Use a strict separator so model learns mapping: prompt -> code
    sep = "\n### CADQUERY CODE\n"
    # `or ""` also covers keys that exist with a None value, which .get()'s default does not
    prompt = (example.get("prompt") or "").strip()
    code = (example.get("code") or "").strip()
    # keep only prompt+sep+code
    return {"text": prompt + sep + code + "\n"}

def main():
    """Fine-tune BASE_MODEL on prompt -> CadQuery-code text and save it to OUT_DIR.

    Each example ends with the tokenizer's EOS token and EOS stays in the labels,
    so the model is trained to stop after the code. Only padding added at batch
    time is excluded from the loss.
    """
    # Local imports: only this function needs them
    import torch
    from transformers import DataCollatorForSeq2Seq

    # load json dataset
    ds = load_dataset("json", data_files=DATA_JSON, split="train")
    # combine into single text examples with separator
    ds = ds.map(make_text, remove_columns=ds.column_names)

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    # gpt2 doesn't have pad token by default — set it to eos
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)

    def tokenize_fn(batch):
        """Tokenize without padding; append EOS and copy input ids as labels."""
        texts = [text + tokenizer.eos_token for text in batch["text"]]
        encoded = tokenizer(texts, truncation=True, max_length=MAX_LENGTH)
        encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
        return encoded

    tok_ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])

    # Pads each batch to its longest example and pads labels with -100.
    # DataCollatorForLanguageModeling is avoided here: it masks every token equal
    # to pad_token_id, and since pad == eos that would also mask the EOS label.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, label_pad_token_id=-100)

    # fp16 mixed precision needs a CUDA device; fall back to fp32 when none is present
    use_fp16 = os.getenv("USE_FP16", "1") == "1" and torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir=OUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH,
        gradient_accumulation_steps=1,
        learning_rate=LR,
        save_strategy="epoch",
        eval_strategy="no",
        logging_steps=50,
        fp16=use_fp16,
        seed=SEED,
        push_to_hub=False,
        save_total_limit=3
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_ds,
        data_collator=data_collator,
        processing_class=tokenizer
    )

    trainer.train()
    # save model + tokenizer correctly
    trainer.save_model(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)
    print("Saved finetuned model to", OUT_DIR)

# In a notebook cell __name__ is "__main__", so running the cell starts training.
if __name__ == "__main__":
    main()

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,1.3099
100,0.2504
150,0.2287
200,0.2112
250,0.2085
300,0.1965
350,0.1903
400,0.189
450,0.19
500,0.1844


Saved finetuned model to ./cad_llm


# **generate cylinder step file**

In [None]:
# generate_step.py
import sys
import re
import textwrap
import os
import builtins

# For headless environments
# Only sets DISPLAY when it is missing; an existing value is left untouched.
if 'DISPLAY' not in os.environ:
    os.environ['DISPLAY'] = ':99'

# CadQuery is required for everything below, so a failed import ends the process.
try:
    import cadquery as cq
except ImportError as e:
    print("❌ CadQuery import failed. Please install system dependencies:")
    print("sudo apt-get install -y libgl1-mesa-glx libgl1-mesa-dev libegl1-mesa")
    sys.exit(1)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_DIR = "./cad_llm"                 # folder written by the gpt2 training cell
OUT_STEP = "generated_from_llm.step"    # default STEP output path
MAX_NEW_TOKENS = 512                    # generation budget passed to model.generate
MAX_INPUT_LEN = 256                     # prompt truncation length for the tokenizer

# ---------------- SAFE BUILTINS ----------------
def get_safe_builtins():
    """Return a restricted set of Python builtins for executing generated CadQuery code.

    Besides a few harmless numeric/sequence helpers, the mapping provides an
    ``__import__`` that only resolves whitelisted top-level modules, so
    ``import`` statements in generated code cannot load ``os``, ``subprocess``, etc.

    NOTE: this narrows what generated code can reach but is not a hardened
    sandbox; untrusted code should still be run in an isolated process.

    Returns:
        dict mapping builtin names to the objects exposed to ``exec``.
    """
    allowed_names = (
        "abs", "min", "max", "round",
        "int", "float", "len", "range",
        "enumerate",
    )
    allowed_modules = {"cadquery", "math"}

    def restricted_import(name, globals=None, locals=None, fromlist=(), level=0):
        """Import `name` only when its top-level package is whitelisted."""
        # level != 0 is a relative import, which has no meaning for exec'd snippets
        if level != 0 or name.split(".")[0] not in allowed_modules:
            raise ImportError(f"Import of '{name}' is not allowed in generated code")
        return builtins.__import__(name, globals, locals, fromlist, level)

    safe = {name: getattr(builtins, name) for name in allowed_names}
    safe["__import__"] = restricted_import
    return safe

# ---------------- LOAD MODEL ----------------
def load_model(model_dir=MODEL_DIR):
    """Load the tokenizer and causal LM stored in `model_dir`.

    The tokenizer's pad token falls back to its EOS token when unset, and the
    model is switched to eval mode.

    Returns:
        (tokenizer, model) tuple. On any loading error a message is printed and
        the process exits with status 1.
    """
    try:
        tok = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token
        lm = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")
        lm.eval()
    except Exception as err:
        print(f"❌ Failed to load model from {model_dir}: {err}")
        sys.exit(1)
    return tok, lm

# ---------------- GENERATE CODE ----------------
def generate_code(prompt: str, tokenizer, model):
    """Generate CadQuery source text for `prompt` with the fine-tuned model.

    The prompt is suffixed with the "### CADQUERY CODE" separator, decoded with
    4-beam search, and the text after the separator is returned. If the
    separator is missing from the decoded text, everything from the first
    "import cadquery" onwards is used, or the whole text as a last resort.

    Raises:
        RuntimeError: If tokenization, generation or decoding fails, or if the
            generation / extracted code is empty.
    """
    marker = "### CADQUERY CODE"
    prompt_text = prompt.strip() + "\n### CADQUERY CODE\n"

    try:
        encoded = tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN)
        # Move every input tensor to wherever the model lives
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    except Exception as e:
        raise RuntimeError(f"Tokenization failed: {str(e)}")

    with torch.no_grad():
        try:
            generated = model.generate(
                **encoded,
                max_new_tokens=MAX_NEW_TOKENS,
                num_beams=4,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        except Exception as e:
            raise RuntimeError(f"Model generation failed: {str(e)}")

    if generated is None or len(generated) == 0:
        raise RuntimeError("Model returned empty generation")

    try:
        first = generated[0] if isinstance(generated, torch.Tensor) else torch.tensor(generated)
        decoded = tokenizer.decode(first.cpu().tolist(), skip_special_tokens=True)
    except Exception as e:
        raise RuntimeError(f"Decoding failed: {str(e)}")

    if not decoded.strip():
        raise RuntimeError("Decoded output is empty")

    # Extract code: the decoded text still contains the prompt, so cut at the separator
    if marker in decoded:
        code = decoded.partition(marker)[2].strip()
    else:
        found = re.search(r"(import\s+cadquery\b.*)", decoded, re.S | re.I)
        code = found.group(1).strip() if found else decoded

    if not code.strip():
        raise RuntimeError("No code extracted from generation")
    return code

# ---------------- CLEAN CODE ----------------
def clean_generated_code(code: str) -> str:
    """Normalize raw model output into executable-looking CadQuery source.

    Steps: strip markdown fences, put the cadquery import and each known
    assignment on its own line, drop surplus lone ")" lines, append any missing
    closing parentheses, then dedent and trim.

    Only whole identifiers are treated as assignment targets: names that merely
    end with a known target (``my_model``), attribute assignments (``obj.part``)
    and ``==`` comparisons are left untouched.
    """
    # Strip markdown fences
    code = re.sub(r"\`\`\`[a-zA-Z]*", "", code).strip("` \n")

    # Normalize imports/assignments
    code = re.sub(r"\s*(import\s+cadquery)", r"\n\1", code, flags=re.IGNORECASE)
    # (?<![\w.]) requires the name to start an identifier; (?!=) keeps "==" intact
    code = re.sub(
        r"\s*(?<![\w.])(yield_strength|fos|P|cyl_id|cyl_len|cyl_thk|cyl_od|model|result|part)\s*=(?!=)",
        r"\n\1 = ", code)

    # Remove excessive trailing `)` spam: a lone ")" is kept only while something is still open
    cleaned_lines = []
    paren_balance = 0
    for line in code.splitlines():
        if line.strip() == ")" and paren_balance <= 0:
            continue
        paren_balance += line.count("(") - line.count(")")
        cleaned_lines.append(line)

    code = "\n".join(cleaned_lines)

    # Balance parentheses if still mismatched
    missing = code.count('(') - code.count(')')
    if missing > 0:
        code += '\n' + (')' * missing)

    return textwrap.dedent(code).strip()

# ---------------- SANITY CHECK ----------------
def simple_sanity_check(code_text: str):
    """Reject generated code that contains obviously dangerous constructs.

    Patterns are matched on identifier boundaries, so harmless names such as
    ``rod_pos.x`` or ``socket_head`` pass, while ``import os``,
    ``from os import ...``, ``os.``/``sys.`` access and calls to
    ``open``/``eval``/``exec``/``compile`` are refused. This is a coarse text
    filter, not a guarantee of safety.

    Returns:
        True when the code passes.

    Raises:
        RuntimeError: If a forbidden pattern is present, or if the code has
            neither a cadquery import nor a model/result/part assignment.
    """
    forbidden = [
        r"__import__",
        r"\b(?:import|from)\s+(?:os|sys|subprocess|socket|requests|urllib|shutil|importlib|builtins)\b",
        r"\b(?:open|eval|exec|compile)\s*\(",
        r"\b(?:subprocess|socket|requests|urllib)\b",
        r"\b(?:os|sys)\s*\.",
    ]
    for pattern in forbidden:
        hit = re.search(pattern, code_text, re.IGNORECASE)
        if hit:
            raise RuntimeError(f"Forbidden pattern found in generated code: {hit.group(0)}")

    # Raised only when both the import and an assignment are absent
    lower = code_text.lower()
    if "import cadquery" not in lower and not re.search(r"\b(model|result|part)\s*=", code_text):
        raise RuntimeError("Generated code missing CadQuery import or model assignment")
    return True

# ---------------- EXECUTE + EXPORT ----------------
def exec_and_export(code_text: str, out_file=OUT_STEP):
    """Execute generated CadQuery code and export the resulting object as STEP.

    The code runs in a single namespace that provides `cq` and the restricted
    builtins. Using one dict for both globals and locals matters: with separate
    dicts, top-level assignments land in the locals and are invisible to any
    function or comprehension the generated code defines.

    Args:
        code_text: Python source to execute.
        out_file: Path of the STEP file to write.

    Returns:
        `out_file`.

    Raises:
        RuntimeError: If execution fails, or if the code defines none of
            `model`, `result` or `part` (or the first one found is None).
    """
    namespace = {"cq": cq, "__builtins__": get_safe_builtins()}

    try:
        exec(code_text, namespace)
    except Exception as e:
        raise RuntimeError(f"Error during exec: {e}") from e

    # The first of the conventional result names that exists wins
    cad_obj = next(
        (namespace[name] for name in ("model", "result", "part") if name in namespace),
        None,
    )

    if cad_obj is None:
        raise RuntimeError("No CAD object found. Expected 'model'/'result'/'part' variable.")

    cq.exporters.export(cad_obj, out_file)
    return out_file

# ---------------- MAIN ----------------
def main():
    """Interactive entry point: prompt -> generated CadQuery code -> STEP file.

    Reads one description from stdin, loads the model, then generates, cleans,
    checks and executes the code. On failure the error is printed and the raw
    generation (if any) is saved to last_raw_code.txt for debugging.
    """
    print("🔧 CadQuery Code Generator")
    print("=" * 50)

    prompt = input("Enter your CAD description: ").strip()
    if not prompt:
        print("❌ Please provide a valid prompt.")
        return

    print(f"\n🤖 Processing prompt: '{prompt}'")
    print("Loading model...")
    tokenizer, model = load_model(MODEL_DIR)
    print("✅ Model loaded successfully")

    # Defined up front so the error handler can tell whether generation got this far
    raw_code = None
    try:
        print("🔄 Generating CadQuery code...")
        raw_code = generate_code(prompt, tokenizer, model)

        print("\n--- Raw generated code (first 1000 chars) ---")
        print(raw_code[:1000] + ("..." if len(raw_code) > 1000 else ""))

        print("\n🧹 Cleaning generated code...")
        code = clean_generated_code(raw_code)
        print("\n--- Cleaned code ---")
        print(code)

        print("\n🔍 Performing sanity checks...")
        simple_sanity_check(code)

        print("\n⚙️ Executing code and exporting STEP file...")
        out_file = exec_and_export(code)
        print(f"\n✅ SUCCESS! STEP file created: {out_file}")
        print(f"📁 File size: {os.path.getsize(out_file)} bytes")

    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        debug_file = "last_raw_code.txt"
        try:
            # utf-8 so model output with non-ASCII characters cannot break the dump
            with open(debug_file, "w", encoding="utf-8") as f:
                f.write(raw_code if raw_code is not None else "No code generated")
            print(f"🐛 Debug info saved to {debug_file}")
        except OSError as write_err:
            # Best effort: report instead of silently swallowing the failure
            print(f"⚠️ Could not write {debug_file}: {write_err}")

# In a notebook cell __name__ is "__main__", so running the cell starts the interactive generator.
if __name__ == "__main__":
    main()


🔧 CadQuery Code Generator
Enter your CAD description: design a cylinder with bore 80mm, length 150mm and a pressure of 6 bar

🤖 Processing prompt: 'design a cylinder with bore 80mm, length 150mm and a pressure of 6 bar'
Loading model...
✅ Model loaded successfully
🔄 Generating CadQuery code...

--- Raw generated code (first 1000 chars) ---
import cadquery as cq

yield_strength = 205  # MPa
fos = 5
P = 0.678  # MPa
cyl_id = 80
cyl_len = 150
cyl_thk = 1.56
cyl_od = cyl_id + 2 * cyl_thk

model = (
     cq.Workplane("XY")
    .circle(cyl_od/2)
    .circle(cyl_id/2)
    .extrude(cyl_len)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)
)

🧹 Cleaning generated co

In [None]:
# prompt: design a cylinder with id 80mm, 150mm stroke length, and pressure of 6 bar

# **zip files**

In [None]:
# Archives everything in the working directory, including training checkpoints (see the listing below).
!zip -r my_files.zip ./*

  adding: cad_llm/ (stored 0%)
  adding: cad_llm/runs/ (stored 0%)
  adding: cad_llm/runs/Aug18_09-40-52_5649280bfb3c/ (stored 0%)
  adding: cad_llm/runs/Aug18_09-40-52_5649280bfb3c/events.out.tfevents.1755510052.5649280bfb3c.1176.1 (deflated 68%)
  adding: cad_llm/checkpoint-7500/ (stored 0%)
  adding: cad_llm/checkpoint-7500/rng_state.pth (deflated 25%)
  adding: cad_llm/checkpoint-7500/optimizer.pt (deflated 8%)
  adding: cad_llm/checkpoint-7500/scaler.pt (deflated 60%)
  adding: cad_llm/checkpoint-7500/config.json (deflated 51%)
  adding: cad_llm/checkpoint-7500/vocab.json (deflated 59%)
  adding: cad_llm/checkpoint-7500/merges.txt (deflated 53%)
  adding: cad_llm/checkpoint-7500/trainer_state.json (deflated 83%)
  adding: cad_llm/checkpoint-7500/tokenizer_config.json (deflated 55%)
  adding: cad_llm/checkpoint-7500/tokenizer.json (deflated 82%)
  adding: cad_llm/checkpoint-7500/model.safetensors (deflated 7%)
  adding: cad_llm/checkpoint-7500/special_tokens_map.json (deflated 53%)

In [None]:
# Colab-only helper: google.colab is not available outside a Colab runtime.
from google.colab import files
files.download('my_files.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **push/save to hf**

In [None]:
!pip install huggingface_hub
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `cadgpt` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authe

In [None]:
# Widget-based alternative to the CLI login in the previous cell.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfApi

HF_REPO_ID = "polaris314/cad-llm"
api = HfApi()

# exist_ok=True makes this cell safe to re-run when the repository already exists.
try:
    api.create_repo(repo_id=HF_REPO_ID, repo_type="model", exist_ok=True)
    print(f"✅ Repository '{HF_REPO_ID}' created or already exists.")
except Exception as e:
    print(f"❌ Failed to create repository: {e}")

✅ Repository 'polaris314/cad-llm' created or already exists.


In [None]:
from huggingface_hub import HfApi
from transformers import AutoModelForCausalLM, AutoTokenizer

# your HF repo id
HF_REPO_ID = "polaris314/cad-llm" # Make sure this is the correct repo ID

# load local model (doubles as a check that the folder is loadable before uploading)
model = AutoModelForCausalLM.from_pretrained("./cad_llm")
tokenizer = AutoTokenizer.from_pretrained("./cad_llm")

# push to hub
# Intermediate checkpoints (optimizer/scheduler/RNG state, roughly 1 GB each) and
# TensorBoard event files are training artifacts, not part of the model — skip them.
api = HfApi()
api.upload_folder(
    folder_path="./cad_llm",
    repo_id=HF_REPO_ID,
    repo_type="model",
    ignore_patterns=["checkpoint-*/*", "runs/*"],
)

print(f"✅ Uploaded model to https://huggingface.co/{HF_REPO_ID}")

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

Upload 21 LFS files:   0%|          | 0/21 [00:00<?, ?it/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/996M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

events.out.tfevents.1755510052.5649280bfb3c.1176.1:   0%|          | 0.00/37.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

✅ Uploaded model to https://huggingface.co/polaris314/cad-llm


In [None]:
import torch
# Quick runtime check: whether CUDA is usable and how many devices are visible.
print(torch.cuda.is_available())
print(torch.cuda.device_count())


True
1


In [None]:
# Show the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Mon Sep  1 11:26:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# **air motor**

# **generate dataset**

In [None]:
import pandas as pd, json

# Input spreadsheet and the common stem of the two output files (Colab paths).
in_path = "/content/air_motor.xlsx"
out_stem = "/content/air_motor_prompts"

# --- required columns ---
# Every column listed here must be present in the spreadsheet.
REQUIRED_COLS = [
    # operating conditions
    "pressure_bar", "pressure_mpa", "stroke_length",
    # cylinder
    "cyl_id", "cyl_len", "cyl_thk", "cyl_od",
    # piston disc
    "disc_dia", "disc_thk", "thru_hole", "counterbore",
    "groove_dia", "groove_height",
    # piston rod
    "head_dia", "head_length", "neck_dia", "neck_length", "chamf_dist",
    "piston_dia", "piston_length", "ext_length_A", "ext_dia_A",
    "threaded_depth",
    # flange and hub
    "flange_dia", "flange_thk", "hub_od", "hub_id", "hub_length",
    "ext_dia_B", "ext_length_B", "center_hole_dia", "center_hole_depth",
    "bolt_hole_radius", "pattern_radius", "small_radius", "n_bolts",
]

# --- load file ---
df = pd.read_excel(in_path)

# Fail fast, naming every absent column in REQUIRED_COLS order.
missing = [col for col in REQUIRED_COLS if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# --- prompt template ---
import random

# Natural-language prompt templates, keyed by part name. Each part has four
# paraphrases of the same request; prompt_for() picks one at random and fills
# the {cyl_id}, {stroke_length} and {pressure_bar} placeholders via str.format.
PROMPT_TEMPLATES = {
    "piston_disc": [
        "Generate a piston disc for an air motor with cylinder ID {cyl_id}mm, stroke length {stroke_length}mm, designed for pressure {pressure_bar} bar.",
        "Design a piston disc for a cylinder of ID {cyl_id} mm and stroke {stroke_length} mm, operating at {pressure_bar} bar.",
        "Create a piston disc model for an air motor (cyl ID {cyl_id} mm, stroke {stroke_length} mm, pressure {pressure_bar} bar).",
        "Build the piston disc of an air motor: bore {cyl_id} mm, stroke {stroke_length} mm, working pressure {pressure_bar} bar."
    ],
    "piston_rod": [
        "Generate a piston rod for an air motor with cylinder ID {cyl_id} mm, stroke {stroke_length} mm, and pressure {pressure_bar} bar.",
        "Design a piston rod for a cylinder (bore {cyl_id} mm, stroke {stroke_length} mm, pressure {pressure_bar} bar).",
        "Create a piston rod model: cyl ID {cyl_id} mm, stroke length {stroke_length} mm, pressure {pressure_bar} bar.",
        "Build the piston rod for an air motor bore {cyl_id} mm, stroke {stroke_length} mm, at {pressure_bar} bar."
    ],
    "flange": [
        "Generate a flange for an air motor with cylinder ID {cyl_id} mm, stroke {stroke_length} mm, and pressure {pressure_bar} bar.",
        "Design a mounting flange for a cylinder (ID {cyl_id} mm, stroke {stroke_length} mm, pressure {pressure_bar} bar).",
        "Create a flange for an air motor: bore {cyl_id} mm, stroke {stroke_length} mm, working pressure {pressure_bar} bar.",
        "Build a flange component for cylinder ID {cyl_id} mm, stroke {stroke_length} mm, operating at {pressure_bar} bar."
    ]
}

def prompt_for(part: str, row: pd.Series) -> str:
    """Render a randomly chosen prompt template for ``part`` from one data row.

    Args:
        part: Key into PROMPT_TEMPLATES ("piston_disc", "piston_rod", "flange").
        row: Record providing "cyl_id", "stroke_length" and "pressure_bar".

    Returns:
        The formatted prompt. ``cyl_id`` and ``stroke_length`` are rounded to
        whole numbers; ``pressure_bar`` is inserted as a float.

    Raises:
        KeyError: If ``part`` has no templates or ``row`` lacks a needed field.
    """
    # Pick the template first so RNG consumption and error order are unchanged.
    template = random.choice(PROMPT_TEMPLATES[part])
    whole_numbers = {
        field: int(round(float(row[field])))
        for field in ("cyl_id", "stroke_length")
    }
    return template.format(pressure_bar=float(row["pressure_bar"]), **whole_numbers)


# --- cadquery code templates ---
# CadQuery source snippets, keyed by part name. The snippets are plain text:
# they refer to `cq` and to bare parameter names (disc_dia, head_dia,
# flange_dia, ...) that are not defined here. code_for() prepends the
# `import cadquery as cq` line and one assignment per spreadsheet column so the
# resulting script has those names in scope.
CQ_TEMPLATES = {
    # Disc with a counterbored through-hole and a groove cut around the rim.
    "piston_disc": r"""
# piston disc
cbore_height = 3
piston_disc = (cq.Workplane("XY").circle(disc_dia/2).extrude(disc_thk))
piston_disc = (piston_disc.faces(">Z").workplane()
               .cboreHole(thru_hole, counterbore, cbore_height, depth=None))
piston_disc = (piston_disc.faces(">Z").workplane(offset = -(disc_thk - groove_height)/2)
               .circle(groove_dia/2).circle(disc_dia/2).cutBlind(-groove_height))
""",
    # Rod built bottom-up: chamfered head, neck, lofted shoulder, piston body,
    # then the filleted extension "A".
    "piston_rod": r"""
# piston rod
head = (cq.Workplane("XY").circle(head_dia/2).extrude(head_length).edges("<Z").chamfer(2.5))
neck = (head.faces(">Z").circle(neck_dia/2).extrude(neck_length))
shoulder = (neck.faces(">Z").circle((neck_dia + 5)/2)
            .workplane(offset=chamf_dist).circle(piston_dia/2).loft(combine="a"))
piston = (shoulder.faces(">Z").circle(piston_dia/2).extrude(piston_length))
piston = (piston.faces(">Z").workplane().circle(ext_dia_A/2).extrude(ext_length_A).fillet(0.5))
""",
    # Flange plate with a polar bolt pattern and centre hole, plus a hub and
    # extension "B" stacked on its top face.
    "flange": r"""
# flange
flange = (cq.Workplane("XY").circle(flange_dia/2).extrude(flange_thk).edges(">Z").fillet(1))
flange = (flange.faces(">Z").workplane(centerOption="CenterOfMass")
          .polarArray(pattern_radius, 0, 360, int(n_bolts))
          .circle(small_radius).circle(bolt_hole_radius)
          .extrude(-flange_thk).edges("|Z").fillet(5))
flange = (flange.faces(">Z").workplane().hole(center_hole_dia, center_hole_depth))
flange_hub = (flange.faces(">Z").workplane().center(0,0)
              .circle(hub_od/2).circle(hub_id/2).extrude(hub_length))
flange = (flange_hub.faces(">Z").workplane()
          .circle(ext_dia_B/2).circle(hub_id/2).extrude(ext_length_B))

""",
}

def code_for(part, row):
    """Build the CadQuery script used as the training completion for ``part``.

    The script is a header followed by ``CQ_TEMPLATES[part]``. The header holds
    the cadquery import and one ``name = value`` assignment for every non-NaN
    entry of ``row`` (all columns, not only those the part's template uses).

    Values that convert to float are written as an int when they are within
    1e-9 of a whole number, otherwise as a float. Anything that fails the
    numeric conversion is written with ``repr``.
    """
    script_lines = ["import cadquery as cq", "", "# Parameters from dataset row"]
    for name, value in row.items():
        if pd.isna(value):
            continue
        try:
            number = float(value)
            if abs(number - round(number)) < 1e-9:
                assignment = f"{name} = {int(round(number))}"
            else:
                assignment = f"{name} = {number}"
        except Exception:
            # Non-numeric cell: emit it as a Python literal instead.
            assignment = f"{name} = {repr(value)}"
        script_lines.append(assignment)
    header = "\n".join(script_lines)
    return header + "\n" + CQ_TEMPLATES[part]

# --- generate dataset ---
# Seed the RNG so prompt_for() draws the same template for every (row, part)
# pair on each run; unseeded, the saved dataset changed between executions.
random.seed(42)

parts = ["piston_disc", "piston_rod", "flange"]

# One record per (spreadsheet row, part): a natural-language prompt paired with
# the CadQuery script that builds that part from the row's parameters.
rows_out = []
for idx, row in df.iterrows():
    for part in parts:
        rows_out.append({
            "row_id": int(idx),
            "part": part,
            "prompt": prompt_for(part, row),
            "completion": code_for(part, row),
        })

out_csv = out_stem + ".csv"
out_jsonl = out_stem + ".jsonl"

# Write the same records twice: as CSV and as JSONL (one JSON object per line).
pd.DataFrame(rows_out).to_csv(out_csv, index=False)
with open(out_jsonl, "w", encoding="utf-8") as f:
    for rec in rows_out:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# The per-row count is derived from `parts` so the message cannot drift from
# the list above.
print(f"✅ Saved {out_csv} and {out_jsonl} with {len(rows_out)} samples "
      f"({len(parts)} per input row).")


✅ Saved /content/air_motor_prompts.csv and /content/air_motor_prompts.jsonl with 30000 samples (3 per input row).


# **train gpt2 on generated data**

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
pip install -U transformers datasets accelerate


Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohtt

In [None]:
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from google.colab import userdata # Import userdata

# CONFIG
DATA_JSON = "/content/air_motor_prompts.jsonl"  # adjust path if needed
BASE_MODEL = "gpt2"
OUT_DIR = "/content/cad_llm"
EPOCHS = 2              # start with 2, bump to 3 if eval loss still dropping
BATCH = 2               # per-device batch
GRAD_ACCUM = 4          # effective batch = BATCH * GRAD_ACCUM
MAX_LENGTH = 1024       # GPT-2 context size
LR = 5e-5
SEED = 42

# Get Hugging Face Token from Colab Secrets
# Make sure you have saved your token in Colab Secrets named 'HF_TOKEN'
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    # A missing or inaccessible secret raises; treat it as "no token" so the
    # fallback and the warning below are actually reachable.
    HF_TOKEN = None
if not HF_TOKEN:
    # The warning below promises an environment-variable fallback; honour it.
    HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("⚠️ Warning: Hugging Face token not found in Colab Secrets ('HF_TOKEN').")
    print("Please add your token to Colab Secrets or ensure it's available as an environment variable.")
    # You can still try to proceed, but loading models might fail if they require auth.


def make_text(example):
    """Merge one record's prompt and completion into a single training string.

    Missing or empty "prompt"/"completion" values are treated as "". Both are
    stripped, joined by the "### CADQUERY CODE" marker line, and terminated
    with a newline. Returns ``{"text": <merged string>}``.
    """
    prompt_text = (example.get("prompt") or "").strip()
    code_text = (example.get("completion") or "").strip()
    pieces = [prompt_text, "\n### CADQUERY CODE\n", code_text, "\n"]
    return {"text": "".join(pieces)}

def main():
    """Fine-tune BASE_MODEL on the prompt/completion pairs stored in DATA_JSON.

    Steps: load the JSONL file, hold out 2% for evaluation, merge each record
    into one training string with make_text(), tokenize, train with the
    Hugging Face Trainer (evaluating and checkpointing every 200 steps and
    reloading the checkpoint with the lowest eval loss), then save the model
    and tokenizer to OUT_DIR.

    Reads the module-level settings DATA_JSON, BASE_MODEL, OUT_DIR, EPOCHS,
    BATCH, GRAD_ACCUM, MAX_LENGTH, LR, SEED and HF_TOKEN.
    """
    import inspect  # local import: only needed for the Trainer signature check

    # load json dataset
    raw_ds = load_dataset("json", data_files=DATA_JSON, split="train")
    # tiny eval split (2%)
    splits = raw_ds.train_test_split(test_size=0.02, seed=SEED)
    train_ds, eval_ds = splits["train"], splits["test"]

    # combine fields -> training text
    train_ds = train_ds.map(make_text, remove_columns=train_ds.column_names)
    eval_ds  = eval_ds.map(make_text, remove_columns=eval_ds.column_names)

    # Pass the token explicitly when loading tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        # NOTE(review): padding reuses the EOS token and make_text() never
        # appends EOS, so training text appears to contain no end-of-sequence
        # token. Confirm generation is bounded some other way.
        tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, token=HF_TOKEN)
    model.resize_token_embeddings(len(tokenizer))  # <-- align with tokenizer

    def tokenize_fn(batch):
        # No padding here. Padding every sample to MAX_LENGTH made each step
        # process 1024 tokens per sequence regardless of its real length; the
        # collator below pads each batch only to its longest member.
        return tokenizer(
            batch["text"],
            truncation=True,
            max_length=MAX_LENGTH,
            return_attention_mask=True,
        )

    train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
    eval_tok  = eval_ds.map(tokenize_fn, batched=True, remove_columns=["text"])

    # Causal-LM collator: pads dynamically and builds labels from input_ids.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=OUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH,
        gradient_accumulation_steps=GRAD_ACCUM,   # BATCH * GRAD_ACCUM sequences per optimizer step
        learning_rate=LR,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=True,   # safe for Colab T4
        report_to="none",
        seed=SEED,
        # Pass the token to the Trainer arguments as well (optional but good practice)
        hub_token=HF_TOKEN,
    )

    # Newer transformers releases name this Trainer parameter `processing_class`
    # and deprecate `tokenizer`; pass whichever the installed version declares.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=eval_tok,
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()
    trainer.save_model(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)
    print("✅ Saved finetuned model to", OUT_DIR)

if __name__ == "__main__":
    main()

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/29400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/29400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,0.302,0.185147
400,0.1246,0.100483
600,0.0863,0.068016
800,0.0702,0.06097
1000,0.0606,0.050167
1200,0.0562,0.045852
1400,0.053,0.045341
1600,0.0473,0.040319
1800,0.0462,0.038916
2000,0.0447,0.03776


Step,Training Loss,Validation Loss
200,0.302,0.185147
400,0.1246,0.100483
600,0.0863,0.068016
800,0.0702,0.06097
1000,0.0606,0.050167
1200,0.0562,0.045852
1400,0.053,0.045341
1600,0.0473,0.040319
1800,0.0462,0.038916
2000,0.0447,0.03776


In [None]:
import os, inspect
from packaging import version
from datasets import load_dataset
from transformers import (
    __version__ as HF_VERSION,
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)

# CONFIG
DATA_JSON = "/content/air_motor_prompts.jsonl"   # adjust path
BASE_MODEL = "gpt2"      # Hub id of the checkpoint to fine-tune
OUT_DIR = "/content/cad_llm"   # checkpoints and the final model/tokenizer are written here
EPOCHS = 2
BATCH = 2                # per-device train batch size
GRAD_ACCUM = 4           # gradient accumulation steps (BATCH * GRAD_ACCUM sequences per optimizer step)
MAX_LENGTH = 1024        # tokenizer truncation length
LR = 5e-5
SEED = 42                # used for the train/eval split and for TrainingArguments

def make_text(example):
    """Turn a {"prompt", "completion"} record into ``{"text": ...}``.

    Each field defaults to "" when absent or falsy and is stripped. The text is
    the prompt, the "### CADQUERY CODE" marker on its own line, the completion,
    and a trailing newline.
    """
    marker = "\n### CADQUERY CODE\n"
    # The dataset stores the generated code under 'completion'.
    prompt_text, code_text = (
        (example.get(field) or "").strip() for field in ("prompt", "completion")
    )
    return {"text": marker.join((prompt_text, code_text)) + "\n"}

def _supports(arg_name):
    """Return True if ``TrainingArguments.__init__`` declares ``arg_name``."""
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    return arg_name in accepted

def main():
    """Fine-tune BASE_MODEL on DATA_JSON, tolerating differing transformers APIs.

    Loads the JSONL dataset, holds out 2% for evaluation, merges each record
    into one training string with make_text(), tokenizes, and trains with the
    Hugging Face Trainer. No evaluation runs during training; a single
    best-effort evaluation runs afterwards. Checkpoints are saved once per
    epoch and the final model and tokenizer are written to OUT_DIR.

    Optional TrainingArguments/Trainer keywords are passed only when the
    installed transformers version declares them.
    """
    # Dataset
    raw = load_dataset("json", data_files=DATA_JSON, split="train")
    splits = raw.train_test_split(test_size=0.02, seed=SEED)
    train_ds, eval_ds = splits["train"], splits["test"]
    train_ds = train_ds.map(make_text, remove_columns=train_ds.column_names)
    eval_ds  = eval_ds.map(make_text,  remove_columns=eval_ds.column_names)

    # Tokenizer + model
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tok.pad_token is None:
        # NOTE(review): padding reuses the EOS token and make_text() never
        # appends EOS, so training text appears to contain no end-of-sequence
        # token. Confirm generation is bounded some other way.
        tok.add_special_tokens({"pad_token": tok.eos_token})
    model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
    model.resize_token_embeddings(len(tok))

    def tok_fn(batch):
        # No padding here. Padding every sample to MAX_LENGTH made each step
        # process 1024 tokens per sequence regardless of its real length; the
        # collator below pads each batch only to its longest member.
        return tok(batch["text"], truncation=True, max_length=MAX_LENGTH,
                   return_attention_mask=True)

    train_tok = train_ds.map(tok_fn, batched=True, remove_columns=["text"])
    eval_tok  = eval_ds.map(tok_fn,  batched=True, remove_columns=["text"])

    # Causal-LM collator: pads dynamically and builds labels from input_ids.
    collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

    # ---- Build TrainingArguments with only supported kwargs ----
    # Evaluation-during-training arguments are intentionally absent; see the
    # manual evaluation after trainer.train().
    ta_kwargs = dict(
        output_dir=OUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH,
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LR,
        logging_steps=50,
        fp16=True,
        seed=SEED,
        save_total_limit=3,
        save_strategy="epoch",
    )

    # common extras (add if supported in your version)
    if _supports("warmup_steps"):
        ta_kwargs["warmup_steps"] = 500
    if _supports("weight_decay"):
        ta_kwargs["weight_decay"] = 0.01
    if _supports("report_to"):
        ta_kwargs["report_to"] = "none"

    # The hub token is optional (only hub features such as push_to_hub need
    # it), so neither running outside Colab nor a missing secret may abort
    # training. Colab Secrets win over the HF_TOKEN environment variable.
    hf_token = os.environ.get("HF_TOKEN")
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None  # not running on Colab
    if userdata is not None:
        try:
            hf_token = userdata.get('HF_TOKEN') or hf_token
        except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
            pass  # secret absent or not shared with this notebook
    if hf_token and _supports("hub_token"):
        ta_kwargs["hub_token"] = hf_token

    args = TrainingArguments(**ta_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=eval_tok,     # used by the manual evaluation below
        data_collator=collator,
    )
    # Newer transformers releases name this parameter `processing_class`; older
    # ones name it `tokenizer`; very old ones accept neither. Checking only
    # for `tokenizer` would drop the tokenizer on releases that renamed it.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        trainer_kwargs["processing_class"] = tok
    elif "tokenizer" in trainer_params:
        trainer_kwargs["tokenizer"] = tok

    print(f"Using transformers {HF_VERSION} with args: {sorted(ta_kwargs.keys())}")
    trainer = Trainer(**trainer_kwargs)

    trainer.train()
    # Manual eval after training. Deliberately best-effort: a failure here is
    # reported but must not prevent the trained model from being saved.
    try:
        print("\nRunning manual evaluation after training...")
        metrics = trainer.evaluate(eval_dataset=eval_tok) # Pass eval_dataset explicitly
        print("Eval metrics:", metrics)
    except Exception as e:
        print("\nManual evaluation failed:", e)


    trainer.save_model(OUT_DIR)
    tok.save_pretrained(OUT_DIR)
    print("✅ Saved finetuned model to", OUT_DIR)

if __name__ == "__main__":
    main()

Map:   0%|          | 0/29400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/29400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Using transformers 4.56.0 with args: ['fp16', 'gradient_accumulation_steps', 'hub_token', 'learning_rate', 'logging_steps', 'num_train_epochs', 'output_dir', 'per_device_train_batch_size', 'report_to', 'save_strategy', 'save_total_limit', 'seed', 'warmup_steps', 'weight_decay']


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.8808
100,1.7993
150,0.695
200,0.3019
250,0.2043
300,0.1635
350,0.1419
400,0.1247
450,0.1148
500,0.1008


In [None]:
# Load the fine-tuned model and its tokenizer from the Hub repository.
# Joining the two calls with `and` bound neither result to a name and
# discarded the loaded model; assign each to its own variable.
cad_model = AutoModelForCausalLM.from_pretrained("polaris314/cad-llm")
cad_tokenizer = AutoTokenizer.from_pretrained("polaris314/cad-llm")

It looks like there's an authentication issue when trying to download the base model from Hugging Face. Please run the cell below to log in to your Hugging Face account.

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

It looks like your `transformers` library is outdated. Let's upgrade it to a version that supports the arguments used in the training code.

In [None]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/250.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [None]:
!pip install wandb -qqq
import wandb

wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter: