# Training Notebook


ECE-GY 7123 / CS-GY 6953 / Deep Learning - Fall '25 - Final Project

**Team:** Spline Reticulator

**Author/Member:** Thanh Do (qd2121@nyu.edu)

## Libraries

In [11]:
!pip install coverage xmlschema



In [12]:
import os
import glob
import torch
import sys
import coverage
import xmlschema
import math
import re
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from tqdm import tqdm

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Reproducibility

In [14]:
import random
def set_seed(seed = 0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(123456)

## Constants

In [15]:
# Base model name
MODEL_NAME = "Qwen/Qwen3-0.6B"
# W3C dataset
DATA_DIR = "./dataset"
# New token limit to fit inside VRAM
MAX_NEW_TOKENS = 256
# Cap total number of samples that we'll use for training
MAX_DATASET_LENGTH = 4096
# Maximum length for the tokenizer
MAX_LENGTH = 1024

# LoRA parameters
LORA_RANK = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Training parameters
OPTIMIZER = "adamw_torch"
EPOCHS = 15
LR = 2e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 5

# Evaluation parameters
# Numbers of XML files to be sampled from the models
GENERATE_COUNT = 100

## Data

In [16]:
!wget https://github.com/w3c/xsdtests/archive/refs/heads/master.zip
!unzip -q master.zip
!mv xsdtests-master/ dataset
!rm master.zip

--2025-12-18 18:22:17--  https://github.com/w3c/xsdtests/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/w3c/xsdtests/zip/refs/heads/master [following]
--2025-12-18 18:22:17--  https://codeload.github.com/w3c/xsdtests/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.121.10
Connecting to codeload.github.com (codeload.github.com)|140.82.121.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip’

master.zip              [           <=>      ]  26.15M  12.5MB/s    in 2.1s    

2025-12-18 18:22:19 (12.5 MB/s) - ‘master.zip’ saved [27425403]



In [17]:
def minify_xml(content):
  """
  Removes comments and spaces between tags from a XML file.
  """
  import re
  content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
  content = re.sub(r'>\s+<', '><', content)
  return content

def load_and_process_data(tokenizer):
  """
  Globs and loads all XML/XSD files from the dataset.
  """
  print(f"Loading XML/XSD files from {DATA_DIR}...")
  files = glob.glob(f"{DATA_DIR}/**/*.xsd", recursive=True)

  data = []
  print(f"Found {len(files)} files.")
  skipped = 0
  processed = 0

  random.shuffle(files)

  for f_path in files:
    try:
      with open(f_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = minify_xml(f.read())
        # Only train on XML schemas
        if '"http://www.w3.org/2001/XMLSchema"' not in content:
          skipped += 1
          continue
        processed += 1
        # Ensure that we don't truncate anything
        tokens = tokenizer.encode(content, add_special_tokens=True)
        if len(tokens) < MAX_NEW_TOKENS:
          data.append({"text": content})
          if len(data) >= MAX_DATASET_LENGTH:
            break
        else:
          skipped += 1
    except Exception:
      continue

  print(f"Processed {processed} samples, skipped {skipped} samples.")

  full_dataset = Dataset.from_list(data)
  split_dataset = full_dataset.train_test_split(test_size=0.1) # 10% for Validation

  return split_dataset["train"], split_dataset["test"]

## Evaluation Pipeline

In [18]:
def extract_first_xml_block(text):
  """
  Extracts the first complete XML document, including Declaration/Doctype.
  1. Finds the first '<'.
  2. Tracks depth starting from the first actual Element.
  3. Stops when that Root Element closes.
  """
  # 1. Find the very first '<' (Skip leading whitespace/text)
  start_match = re.search(r'<', text)
  if not start_match:
    return None

  start_index = start_match.start()

  # 2. Regex to identify XML structures
  # Groups:
  # 1. Comments/Declarations/Doctype: <!-- ... --> OR <? ... ?> OR <! ... >
  # 2. Closing Tag: </ ... >
  # 3. Self-Closing Tag: < ... />
  # 4. Opening Tag: < ... >
  tag_pattern = re.compile(
    r'(<!--.*?-->|<\?.*?\?>|<![^>]*>)|(</[^>]+>)|(<[^>]+/>)|(<[^>]+>)',
    re.DOTALL
  )

  depth = 0
  root_found = False

  # We scan starting from the first '<'
  # match.end() gives us the position in the original string
  for match in tag_pattern.finditer(text, pos=start_index):
    full_tag = match.group(0)

    # Group 1: Meta-tags (Comments, <?xml?>, <!DOCTYPE>)
    if match.group(1):
      # These exist at depth 0 but don't start the root element structure
      continue

    # Group 2: Closing Tag </foo>
    elif match.group(2):
      if root_found:
        depth -= 1
        if depth == 0:
          # ROOT CLOSED! Return everything up to this character
          return text[start_index:match.end()]
      else:
        # Found a closing tag before an opening one? Malformed.
        return None

    # Group 3: Self-Closing Tag <foo />
    elif match.group(3):
      if not root_found:
        # If this is the first element, it's the root. And it ends here.
        return text[start_index:match.end()]
      # If we are inside root, it's just a child, depth unchanged.

    # Group 4: Opening Tag <foo>
    elif match.group(4):
      if not root_found:
        root_found = True
        depth = 1
      else:
        depth += 1

  # If loop finishes and we didn't return, the file is truncated/incomplete
  return None

def generate_samples(model, tokenizer, save_folder, prefix="gen", gen_cnt=GENERATE_COUNT):
  """
  Generates samples and saves them to a folder.
  """
  print(f"Generating {gen_cnt} samples into {save_folder}...")
  os.makedirs(save_folder, exist_ok=True)
  model.eval()

  # Prompts to trigger XSD generation (hopefully)
  prompts = [
    "<?xml version=",
    "<xs:schema",
  ]

  attempt = 0
  saved_count = 0
  pbar = tqdm(total=gen_cnt)

  while saved_count < gen_cnt:
    attempt += 1
    prompt = prompts[saved_count % len(prompts)]
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
      outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=0.8,
        top_p=0.8,
        top_k=20,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
      )

      generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
      generated_text = extract_first_xml_block(generated_text)
      if generated_text is None:
        # Break if we're stuck
        if attempt >= (4 * gen_cnt):
          print("WARN: We're unable to generate enough valid samples.")
          break
        continue

      # Save to file
      with open(f"{save_folder}/{prefix}_{saved_count}.xml", "w", encoding="utf-8") as f:
        f.write(generated_text)

      # Update progress bar
      pbar.update(1)
      saved_count += 1

  pbar.close()


## Coverage Evaluation

In [31]:
from xmlschema import XMLSchema10, XMLSchema11

XMLSchema10.meta_schema.build()
XMLSchema11.meta_schema.build()

def run_coverage_on_files(file_list, description):
    print(f"\n--- Testing {description} ({len(file_list)} files) ---")

    # Initialize coverage
    cov = coverage.Coverage(source=["xmlschema"])
    cov.start()

    valid_count = 0
    error_count = 0

    for xml_file in file_list:
        try:
            # We try to validate the file.
            # We expect errors (it's a fuzzer!), but we want to see
            xmlschema.XMLSchema(xml_file)
            valid_count += 1
        except Exception:
            # This is expected. We just want the code paths executed.
            error_count += 1

    cov.stop()

    print(f"Valid XMLs: {valid_count} | Invalid/Errors: {error_count}")

    # Calculate metrics
    percent = cov.report(show_missing=False, file=sys.stdout)
    return percent

## Baseline

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(
  MODEL_NAME,
  dtype="auto",
  device_map="auto",
).to(device)

Running on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [21]:
print("\n--- Phase 1: Zero-Shot Baseline ---")
generate_samples(base_model, tokenizer, f"baseline_samples")



--- Phase 1: Zero-Shot Baseline ---
Generating 100 samples into baseline_samples...


100%|██████████| 100/100 [27:33<00:00, 16.54s/it]


### Evaluate Baseline

In [36]:
baseline_files = glob.glob(f"./baseline_samples/*.xml")
base_score = run_coverage_on_files(baseline_files, "Zero-Shot Baseline")


--- Testing Zero-Shot Baseline (100 files) ---
Valid XMLs: 1 | Invalid/Errors: 99
Name                                                                              Stmts   Miss  Cover
-----------------------------------------------------------------------------------------------------
/usr/local/lib/python3.12/dist-packages/xmlschema/__init__.py                        20     20     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/_limits.py                          4      4     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/aliases.py                         70     70     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/arguments.py                      283    242    14%
/usr/local/lib/python3.12/dist-packages/xmlschema/cli.py                            151    151     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/converters/__init__.py             21     19    10%
/usr/local/lib/python3.12/dist-packages/xmlschema/converters/abdera.py               94     94     0%

## Post-train

In [23]:
print("\n--- Phase 2: LoRA Post-Trained ---")

# Prepare LoRA Config
peft_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,
  r=LORA_RANK,
  lora_alpha=LORA_ALPHA,
  lora_dropout=LORA_DROPOUT,
  target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

# Prepare Data
train_data, eval_data = load_and_process_data(tokenizer)
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_eval = eval_data.map(tokenize_function, batched=True)

# Trainer
training_args = TrainingArguments(
  output_dir=f"./checkpoints",
  per_device_train_batch_size=4,
  gradient_accumulation_steps=4, # Simulates larger batch size
  learning_rate=LR,
  num_train_epochs=EPOCHS,
  logging_steps=10,
  fp16=(device=="cuda"),
  optim=OPTIMIZER,
  weight_decay=WEIGHT_DECAY,
  warmup_steps=WARMUP_STEPS,
  eval_strategy="epoch", # Evaluate at end of every epoch
  save_strategy="epoch",
  load_best_model_at_end=True, # Keep the best model (lowest loss)
  metric_for_best_model="eval_loss",
  greater_is_better=False,
  report_to="none"
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_train,
  eval_dataset=tokenized_eval,
  data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

print("\nComputing Final Perplexity...")
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)

print(f"\n=======================================")
print(f"Final Validation Loss: {eval_loss:.4f}")
print(f"Final Perplexity:      {perplexity:.2f}")
print(f"=======================================\n")



--- Phase 2: LoRA Post-Trained ---
trainable params: 5,046,272 || all params: 601,096,192 || trainable%: 0.8395
Loading XML/XSD files from ./dataset...
Found 15464 files.
Processed 7004 samples, skipped 4906 samples.


Map:   0%|          | 0/3686 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Epoch,Training Loss,Validation Loss
1,0.2893,0.289744
2,0.2554,0.239701
3,0.1947,0.217499
4,0.1516,0.212891
5,0.1414,0.208672
6,0.1235,0.206
7,0.1155,0.215266
8,0.0972,0.213791
9,0.0902,0.215265
10,0.0842,0.220822



Computing Final Perplexity...



Final Validation Loss: 0.2060
Final Perplexity:      1.23



## Evaluate Post-train

In [24]:
generate_samples(model, tokenizer, f"lora_samples")

Generating 100 samples into lora_samples...


100%|██████████| 100/100 [34:40<00:00, 20.81s/it]


## Result

In [35]:
lora_files = glob.glob(f"./lora_samples/*.xml")
lora_score = run_coverage_on_files(lora_files, "LoRA")


--- Testing LoRA (100 files) ---


  xmlschema.XMLSchema(xml_file)


Valid XMLs: 15 | Invalid/Errors: 85
Name                                                                              Stmts   Miss  Cover
-----------------------------------------------------------------------------------------------------
/usr/local/lib/python3.12/dist-packages/xmlschema/__init__.py                        20     20     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/_limits.py                          4      4     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/aliases.py                         70     70     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/arguments.py                      283    238    16%
/usr/local/lib/python3.12/dist-packages/xmlschema/cli.py                            151    151     0%
/usr/local/lib/python3.12/dist-packages/xmlschema/converters/__init__.py             21     19    10%
/usr/local/lib/python3.12/dist-packages/xmlschema/converters/abdera.py               94     94     0%
/usr/local/lib/python3.12/dist-packages/xmlsch

## Artifact Checkpointing

In [26]:
# Define the path to save the model checkpoint in Google Drive
save_path = '/content/drive/MyDrive/xml_lora_model'

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model checkpoint and tokenizer saved to: {save_path}")

Model checkpoint and tokenizer saved to: /content/drive/MyDrive/xml_lora_model


In [27]:
!cp -r baseline_samples /content/drive/MyDrive/baseline_samples
!cp -r lora_samples /content/drive/MyDrive/lora_samples