<a href="https://colab.research.google.com/github/dannykovac712/finalProject/blob/master/Final_Project_495.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U transformers datasets accelerate peft bitsandbytes safetensors

In [None]:
# Print the installed transformers release so the run can be reproduced later.
import transformers
print(transformers.__version__)

4.57.3


In [None]:
from google.colab import files

# The training cell reads its dataset from this exact filename.
DATASET_FILENAME = "data.jsonl"

# files.upload() returns a dict mapping each saved filename to the file's bytes.
uploaded = files.upload()
print(uploaded.keys())

# Colab renames an upload whose name is already taken (e.g. "data (1).jsonl"),
# which would leave the training cell silently reading the older "data.jsonl".
# Write the freshly uploaded bytes to the expected name so the new data is used.
if uploaded:
    # Only the first uploaded file is treated as the dataset.
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    if uploaded_name != DATASET_FILENAME:
        with open(DATASET_FILENAME, "wb") as dataset_file:
            dataset_file.write(uploaded_bytes)
        print(f"Copied '{uploaded_name}' to '{DATASET_FILENAME}'")
else:
    print(f"No file uploaded; an existing '{DATASET_FILENAME}' (if any) will be used.")

Saving data.jsonl to data (1).jsonl
dict_keys(['data (1).jsonl'])


In [None]:
import os
from dataclasses import dataclass
from typing import Dict, List, Union

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType




# Base checkpoint to fine-tune, the dataset file, and where results are written.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
DATA_PATH = "data.jsonl"  # JSONL file read by load_dataset further down
OUTPUT_DIR = "./structmath-lora-model"  # Trainer output_dir and final save location

# Training hyperparameters (consumed by tokenize_fn and TrainingArguments below).
MAX_LENGTH = 512  # token cap applied to prompt + target in tokenize_fn
BATCH_SIZE = 1  # per-device batch size for both training and evaluation
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
WARMUP_STEPS = 50
GRAD_ACCUM_STEPS = 8  # passed as gradient_accumulation_steps


# Load the tokenizer; fall back to the EOS token for padding when the
# checkpoint does not define a dedicated pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# `dtype` replaces the `torch_dtype` keyword, which the installed transformers
# release reports as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    dtype=torch.float16,
)


# LoRA adapter settings for causal-LM fine-tuning: rank 16, alpha 32, 5% dropout,
# no bias training, adapters attached to the q/k/v/o projection modules.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the loaded model with the adapters and print how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



# Read the JSONL file at DATA_PATH as a single "train" split. Each record is
# expected to provide the "input" and "output" fields used by format_example.
raw_ds = load_dataset("json", data_files=DATA_PATH, split="train")

def format_example(example):
    """Turn one raw record into a prompt/target pair for supervised fine-tuning.

    Args:
        example: Mapping with an "input" entry (the theorem text) and an
            "output" entry (the JSON answer as a string).

    Returns:
        dict with two strings: "prompt" (task instructions plus the theorem,
        ending with an opening ``<json>`` tag) and "target" (the JSON answer
        followed by a closing ``</json>`` tag).
    """
    prompt_pieces = [
        "You are an assistant that extracts structured information from"
        " mathematical theorems.\n\n",
        "Task: Given the following theorem, output a JSON object with fields:\n",
        "- type\n- id\n- name (optional)\n- assumptions (list of strings)\n",
        "- conclusion (string)\n\n",
        "Theorem:\n",
        f"{example['input']}\n\n",
        "JSON:\n",
        "<json>\n",
    ]

    return {
        "prompt": "".join(prompt_pieces),
        # The closing tag marks the end of the answer the model should produce.
        "target": example["output"] + "\n</json>",
    }

# Swap every raw column for the prompt/target pair built by format_example.
formatted_ds = raw_ds.map(
    format_example,
    remove_columns=raw_ds.column_names,
)

# Hold out 10% of the examples for validation; the seed fixes the split.
ds = formatted_ds.train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = ds["train"], ds["test"]





def tokenize_fn(example):
    """Tokenize one prompt/target pair for causal-LM fine-tuning.

    The prompt and target are tokenized separately so that the prompt
    positions can be excluded from the loss (label -100), leaving only the
    target tokens supervised.

    A BOS token is placed in front of the prompt so training sequences start
    the same way as prompts tokenized with default special-token handling at
    inference time, and an EOS token is appended to the target so the model is
    trained to stop after ``</json>`` instead of generating further blocks.

    Args:
        example: Mapping with string entries "prompt" and "target".

    Returns:
        dict with "input_ids", "labels" and "attention_mask" lists of equal
        length, truncated to at most MAX_LENGTH tokens.
    """
    prompt_ids = tokenizer(
        example["prompt"],
        add_special_tokens=False,
    )["input_ids"]

    target_ids = tokenizer(
        example["target"],
        add_special_tokens=False,
    )["input_ids"]

    # Special tokens are added by hand so each lands on the intended side of
    # the prompt/target boundary: BOS is part of the (masked) prompt, EOS is
    # part of the (supervised) target.
    if tokenizer.bos_token_id is not None:
        prompt_ids = [tokenizer.bos_token_id] + prompt_ids
    if tokenizer.eos_token_id is not None:
        # NOTE: a collator that rebuilds labels from input_ids and masks pad
        # tokens will hide this EOS from the loss when pad_token == eos_token.
        target_ids = target_ids + [tokenizer.eos_token_id]

    input_ids = prompt_ids + target_ids

    # -100 is ignored by the loss, so only target positions contribute.
    labels = [-100] * len(prompt_ids) + target_ids

    # Truncation cuts from the end; an over-long example therefore loses its
    # EOS (and possibly part of the target) rather than teaching an early stop.
    input_ids = input_ids[:MAX_LENGTH]
    labels = labels[:MAX_LENGTH]

    attention_mask = [1] * len(input_ids)

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }

# Both splits are tokenized one example at a time and drop their text columns.
map_options = {"batched": False, "remove_columns": ["prompt", "target"]}
tokenized_train = train_ds.map(tokenize_fn, **map_options)
tokenized_val = val_ds.map(tokenize_fn, **map_options)

# Have both datasets hand back torch tensors when indexed.
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format(type="torch")


# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from `input_ids`
# (masking only pad tokens), which discards the prompt masking produced by
# tokenize_fn and, when pad_token equals eos_token, also hides EOS from the loss.
# DataCollatorForSeq2Seq pads `input_ids`/`attention_mask` through the tokenizer
# and pads the precomputed `labels` with -100, so the -100 prompt mask survives
# batching and the loss is computed on the target tokens only.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)



# Trainer settings; the hyperparameters come from the constants defined above.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    logging_steps=50,
    # Without an evaluation strategy the eval_dataset handed to the Trainer is
    # never used; evaluate once per epoch so the validation loss is reported.
    eval_strategy="epoch",
    fp16=True,
    save_total_limit=3,
    report_to=[],  # no external experiment trackers
)

# Assemble the Trainer from the LoRA-wrapped model, the tokenized splits and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)


# Run fine-tuning.
trainer.train()

# Write the trained model and the tokenizer files into OUTPUT_DIR so the
# directory can be archived and reloaded later.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
# NOTE(review): presumably redundant with trainer.save_model above — confirm before removing.
model.save_pretrained(OUTPUT_DIR)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


Step,Training Loss
50,0.6705
100,0.1741


('./structmath-lora-model/tokenizer_config.json',
 './structmath-lora-model/special_tokens_map.json',
 './structmath-lora-model/chat_template.jinja',
 './structmath-lora-model/tokenizer.model',
 './structmath-lora-model/added_tokens.json',
 './structmath-lora-model/tokenizer.json')

In [None]:
# Save `model` into OUTPUT_DIR once more.
# NOTE(review): the training cell already makes this call — presumably a leftover re-save; confirm.
model.save_pretrained(OUTPUT_DIR)


In [None]:
# Recursively archive the saved model directory into a single zip file.
!zip -r structmath-lora-model.zip structmath-lora-model


In [None]:
# Send the zipped model directory from the Colab runtime to the browser as a download.
from google.colab import files
files.download("structmath-lora-model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import shutil
import zipfile
from google.colab import files

LORA_ZIP_NAME = "structmath-lora-model.zip"
LORA_DIR = "structmath-lora-model"

uploaded = files.upload()

# Fail with a clear message instead of an IndexError when the upload dialog
# is cancelled and nothing comes back.
if not uploaded:
    raise RuntimeError("No file was uploaded; expected a zip of the LoRA model directory.")

# The archive may arrive under a different name (e.g. a numbered copy), so
# fall back to whatever name the upload actually produced.
if LORA_ZIP_NAME not in uploaded:
    LORA_ZIP_NAME = list(uploaded.keys())[0]

# Validate the archive before deleting anything, so a bad upload cannot wipe
# out a previously extracted adapter directory.
if not zipfile.is_zipfile(LORA_ZIP_NAME):
    raise RuntimeError(f"'{LORA_ZIP_NAME}' is not a valid zip archive.")

# Start from a clean directory so files from an earlier extraction do not linger.
if os.path.isdir(LORA_DIR):
    shutil.rmtree(LORA_DIR)

with zipfile.ZipFile(LORA_ZIP_NAME, 'r') as zf:
    zf.extractall(".")

# The archive is expected to contain LORA_DIR as its top-level folder.
if not os.path.isdir(LORA_DIR):
    raise RuntimeError(f" Expected directory '{LORA_DIR}' but it was not created.")

contents = os.listdir(LORA_DIR)

Saving structmath-lora-model2.zip to structmath-lora-model2.zip
 Unzipping structmath-lora-model2.zip -> structmath-lora-model/ ...
'structmath-lora-model' exists. Contents:
['checkpoint-102', 'tokenizer_config.json', 'README.md', 'chat_template.jinja', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer.model', 'adapter_config.json', 'adapter_model.safetensors', 'training_args.bin']
 All key adapter files found. You can now load the model with LORA_DIR = 'structmath-lora-model'.


In [None]:
import re
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
LORA_DIR = "./structmath-lora-model"  # adjust if needed

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer files were saved alongside the adapter, so load them from LORA_DIR.
tokenizer = AutoTokenizer.from_pretrained(LORA_DIR)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# `dtype` replaces the `torch_dtype` keyword, which the installed transformers
# release reports as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    dtype=torch.float16,
    low_cpu_mem_usage=False,
)

# Attach the trained LoRA weights on top of the base model, inference only.
model = PeftModel.from_pretrained(
    base_model,
    LORA_DIR,
    low_cpu_mem_usage=False,
    is_trainable=False,
)

model.to(DEVICE)
model.eval()


def build_prompt(theorem_text: str) -> str:
    """Build the extraction prompt for a single theorem.

    Args:
        theorem_text: Natural-language statement of the theorem.

    Returns:
        The task instructions followed by the theorem, ending with an opening
        ``<json>`` tag that the model is expected to continue from.
    """
    segments = (
        "You are an assistant that extracts structured information from"
        " mathematical theorems.\n\n",
        "Task: Given the following theorem, output a JSON object with fields:\n",
        "- type\n- id\n- name (optional)\n- assumptions (list of strings)\n",
        "- conclusion (string)\n\n",
        "Theorem:\n",
        f"{theorem_text}\n\n",
        "JSON:\n",
        "<json>\n",
    )
    return "".join(segments)


def extract_json_block(text: str):
    """Pull the first ``{...}`` payload found between ``<json>`` and ``</json>`` tags.

    Args:
        text: Text to scan, e.g. the decoded model output.

    Returns:
        The matched JSON text with surrounding whitespace stripped, or None
        when no tagged block is present.
    """
    # DOTALL lets the payload span several lines.
    block_pattern = re.compile(r"<json>\s*(\{.*?\})\s*</json>", re.DOTALL)
    found = block_pattern.search(text)
    return found.group(1).strip() if found is not None else None


def test_theorem(theorem_text: str):
    """Generate the structured JSON for one theorem and print the outcome.

    Prints the full decoded output (prompt included), then one of: the parsed
    JSON, a notice that no ``<json>...</json>`` block was found, or the JSON
    parse error together with the raw block.

    Args:
        theorem_text: Natural-language statement of the theorem.

    Returns:
        None. Everything is reported through ``print``.
    """
    prompt = build_prompt(theorem_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            # Greedy decoding. `temperature` is intentionally not passed: it is
            # ignored (and warned about) when sampling is disabled.
            do_sample=False,
            # Stop as soon as the closing tag is produced so the model does not
            # keep inventing further blocks until the token budget runs out.
            # `stop_strings` requires the tokenizer to be passed to generate().
            stop_strings=["</json>"],
            tokenizer=tokenizer,
            # Set explicitly to avoid the per-call "Setting `pad_token_id`" notice.
            pad_token_id=tokenizer.pad_token_id,
        )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n=== Full model output ===")
    print(full_text)

    json_block = extract_json_block(full_text)
    if json_block is None:
        print("\n No <json>...</json> block found.")
        return

    try:
        parsed = json.loads(json_block)
    except json.JSONDecodeError as e:
        print("\n JSON parse error:", e)
        print("Raw JSON block:")
        print(json_block)
        return

    print("\n Parsed JSON:")
    print(json.dumps(parsed, indent=2))


# ---------- Try a few theorems ----------
if __name__ == "__main__":
    # Smoke-test statements, run in order through test_theorem.
    sample_theorems = (
        "Let f be differentiable on (a,b). Then f is continuous on (a,b).",
        "Let X and Y be independent random variables with finite variance. "
        "Then Var(X + Y) = Var(X) + Var(Y).",
        "If A is an n×n real symmetric matrix, then A has an orthonormal basis of eigenvectors.",
    )
    for sample in sample_theorems:
        test_theorem(sample)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
Let f be differentiable on (a,b). Then f is continuous on (a,b).

JSON:
<json>
 {"type": "theorem", "id": "thm:diff_cont", "name": "Differentiability implies continuity", "assumptions": ["f is differentiable on (a,b)"], "conclusion": "f is continuous on (a,b)"}
</json>

Task type: JSON output

JSON:
<json>
 {"type": "corollary", "id": "cor:diff_cont_cor", "name": "Differentiability implies continuity corollary", "assumptions": ["f is differentiable on an open interval"], "conclusion": "f is continuous on that interval"}
</json>

Task type: JSON output

JSON:
<json>
 {"type": "theorem", "id": "thm:diff_cont_multivar", "name": "Multivariate differentiability implies continuity", "assumptions": ["f is differentiable 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
Let X and Y be independent random variables with finite variance. Then Var(X + Y) = Var(X) + Var(Y).

JSON:
<json>
 {"type": "theorem", "id": "thm:variance_independent_sum", "name": "Variance of Sum of Independent Variables", "assumptions": ["X and Y are independent", "X and Y have finite variance"], "conclusion": "Var(X + Y) = Var(X) + Var(Y)"}
</json>

 Parsed JSON:
{
  "type": "theorem",
  "id": "thm:variance_independent_sum",
  "name": "Variance of Sum of Independent Variables",
  "assumptions": [
    "X and Y are independent",
    "X and Y have finite variance"
  ],
  "conclusion": "Var(X + Y) = Var(X) + Var(Y)"
}

=== Full model output ===
You are an assistant that extracts structured information from mathem

In [None]:
# Further spot checks, run in order through test_theorem.
extra_theorems = [
    "For an elliptic differential operator on a compact manifold, its analytical index equals its topological index.",
    "Let f be differentiable on the interval (a,b). Then f is continuous on (a,b).",
    "Let T: V → W be a linear transformation. If T is injective, then T(v) = 0 implies v = 0.",
    "In a group G, the identity element is unique.",
    "Let (X, d) be a metric space. If X is compact, then every sequence in X has a convergent subsequence.",
]
for statement in extra_theorems:
    test_theorem(statement)



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
For an elliptic differential operator on a compact manifold, its analytical index equals its topological index.

JSON:
<json>
 {"type": "theorem", "id": "thm:elliptic_index_equality", "name": "Index Theorem for Elliptic Operators", "assumptions": ["Operator is elliptic", "Manifold is compact"], "conclusion": "Analytical index equals topological index"}
</json>

 Parsed JSON:
{
  "type": "theorem",
  "id": "thm:elliptic_index_equality",
  "name": "Index Theorem for Elliptic Operators",
  "assumptions": [
    "Operator is elliptic",
    "Manifold is compact"
  ],
  "conclusion": "Analytical index equals topological index"
}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
Let f be differentiable on the interval (a,b). Then f is continuous on (a,b).

JSON:
<json>
 {"type": "theorem", "id": "thm:diff_cont", "name": "Differentiability implies continuity", "assumptions": ["f is differentiable on (a,b)"], "conclusion": "f is continuous on (a,b)"}
</json>

Task type: JSON output

 Parsed JSON:
{
  "type": "theorem",
  "id": "thm:diff_cont",
  "name": "Differentiability implies continuity",
  "assumptions": [
    "f is differentiable on (a,b)"
  ],
  "conclusion": "f is continuous on (a,b)"
}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
Let T: V → W be a linear transformation. If T is injective, then T(v) = 0 implies v = 0.

JSON:
<json>
 {"type": "corollary", "id": "cor:injective_zero", "name": "Injective Transformation Zero Implies Zero Vector", "assumptions": ["T is a linear transformation", "T is injective"], "conclusion": "If T(v) = 0, then v = 0"}
</json>

 Parsed JSON:
{
  "type": "corollary",
  "id": "cor:injective_zero",
  "name": "Injective Transformation Zero Implies Zero Vector",
  "assumptions": [
    "T is a linear transformation",
    "T is injective"
  ],
  "conclusion": "If T(v) = 0, then v = 0"
}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (string)

Theorem:
In a group G, the identity element is unique.

JSON:
<json>
 {"type": "proposition", "id": "prop:unique_identity", "name": "Uniqueness of Identity", "assumptions": ["G is a group"], "conclusion": "There is only one identity element in G"}
</json>

 Parsed JSON:
{
  "type": "proposition",
  "id": "prop:unique_identity",
  "name": "Uniqueness of Identity",
  "assumptions": [
    "G is a group"
  ],
  "conclusion": "There is only one identity element in G"
}

=== Full model output ===
You are an assistant that extracts structured information from mathematical theorems.

Task: Given the following theorem, output a JSON object with fields:
- type
- id
- name (optional)
- assumptions (list of strings)
- conclusion (stri