# Develop transformer

## Import libraries

In [82]:
import json
import glob
import os
import random
import math
from tqdm import tqdm


import torch
from torch.utils.data import Dataset


from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace


from transformers import (
PreTrainedTokenizerFast,
GPT2Config,
GPT2LMHeadModel,
Trainer,
TrainingArguments,
pipeline,
)

import os
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"

random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x142d90530>

## Load data

In [83]:
# Set the path to your dataset directory
path = os.path.expanduser("/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/*.json")

# Load training file names
files = glob.glob(path)

# Randomly select 100 files
sample_files = random.sample(files, min(100, len(files)))

# Read and store randomly sampled documents
sample_data = []
for path in sample_files:
    with open(path) as f:
        data = json.load(f)
        for entry in data:
            sample_data.append(entry.get("text", ""))

print("Number of sampled JSON files:", len(sample_files))
print("Snippet of first doc:\n", sample_data[0][:300], "...")

Number of sampled JSON files: 100
Snippet of first doc:
 alzheimer's disease and other types of dementia are the top cause for disabilities in later life and various types of experiments have been performed to understand the underlying mechanisms of the disease with the aim of coming up with potential drug targets. these experiments have been carried out  ...


### Set up training and heldout data

In [84]:
random.shuffle(sample_data)

train_ratio = 0.9
split_idx = int(len(sample_data) * train_ratio)

train_docs = sample_data[:split_idx]
heldout_docs = sample_data[split_idx:]

# Check snippet of train and heldout docs
print("Number of training documents:", len(train_docs))
print("Number of heldout documents:", len(heldout_docs))
print("Snippet of first training doc:\n", train_docs[0][:300], "...")
print("Snippet of first heldout doc:\n", heldout_docs[0][:300], "...")

Number of training documents: 1862
Number of heldout documents: 207
Snippet of first training doc:
 In vitro studies have revealed that SARS-CoV-2 can infect human lung tissue more effectively and replicate more efficiently compared with SARS-CoV. The number of viral particles in lung tissues infected by SARS-CoV-2 is more than 3.2 times the number of SARS-CoV within 48 h. [82] Blocking viral repl ...
Snippet of first heldout doc:
 The NIH Alzheimer's Disease Neuroimaging Initiative (ADNI) [15] is an ongoing five-year public-private partnership to test whether serial MRI, PET, SNP, other biological markers, and clinical and neuropsychological assessment can be combined to measure the progression of mild cognitive impairment (M ...


## Develop algorithm

### Train tokenizer

In [None]:
# Develop Wordpiece tokenizer
def train_tokenizer(documents, vocab_size=8000):
	tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
	tokenizer.pre_tokenizer = Whitespace()

	trainer = WordPieceTrainer(
		vocab_size=vocab_size,
		special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"]
	)

	tokenizer.train_from_iterator(documents, trainer)
	tokenizer.save("tokenizer.json")
	return tokenizer


raw_tokenizer = train_tokenizer(train_docs)

block_size = 128
data = {
	"input_ids": [],
	"labels": []
}

for doc in train_docs:
	tokens = raw_tokenizer.encode(doc).ids
	for i in range(0, len(tokens) - block_size, block_size):
		input_ids = tokens[i:i+block_size]
		labels = tokens[i+1:i+block_size+1]
		data["input_ids"].append(input_ids)
		data["labels"].append(labels)

hf_tokenizer = PreTrainedTokenizerFast(
	tokenizer_file="tokenizer.json",
	pad_token="[PAD]",
	unk_token="[UNK]",
	bos_token="[BOS]",
	eos_token="[EOS]",
)

hf_tokenizer.save_pretrained("wordpiece_tokenizer")






('wordpiece_tokenizer/tokenizer_config.json',
 'wordpiece_tokenizer/special_tokens_map.json',
 'wordpiece_tokenizer/tokenizer.json')

### Tokenized data

In [None]:
# Reclass data into PyTorch Dataset
class LMDataset(Dataset):
	def __init__(self, data):
		self.input_ids = data["input_ids"]
		self.labels = data["labels"]

	def __len__(self):
		return len(self.input_ids)

	def __getitem__(self, idx):
		return {
			"input_ids": torch.tensor(self.input_ids[idx]),
			"labels": torch.tensor(self.labels[idx]),
		}


train_dataset = LMDataset(data)

### Define decoder-only transformer

In [None]:
# Set up GPT-2 configuration and model
config = GPT2Config(
vocab_size=hf_tokenizer.vocab_size,
n_positions=block_size,
n_embd=256,
n_layer=4,
n_head=4,
bos_token_id=hf_tokenizer.bos_token_id,
eos_token_id=hf_tokenizer.eos_token_id,
)


model = GPT2LMHeadModel(config)

### Set up training

In [None]:
# Develop training arguments and Trainer
training_args = TrainingArguments(
    output_dir="./trained_gpt",
    overwrite_output_dir=True,
    max_steps=1000,
    per_device_train_batch_size=4,
    learning_rate=5e-4,
    weight_decay=0.01,
    logging_steps=50,
    save_steps=400,
    report_to="none",
)

trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
tokenizer=hf_tokenizer,
)


trainer.train()
trainer.save_model("trained_gpt")
hf_tokenizer.save_pretrained("trained_gpt")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss
50,7.5933
100,7.1869
150,7.0901
200,7.0955
250,7.0624
300,7.0278
350,7.0295
400,6.9946
450,6.9328
500,6.9209


('trained_gpt/tokenizer_config.json',
 'trained_gpt/special_tokens_map.json',
 'trained_gpt/tokenizer.json')

## Generate sentences

In [92]:
# Text generation using the trained model
generator = pipeline(
    "text-generation",
    model="./trained_gpt",
    tokenizer="./trained_gpt",
    device=0 if torch.cuda.is_available() else -1,
)

prompt = "data from the study were collected"

outputs = generator(
    prompt,
    max_new_tokens=80,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=hf_tokenizer.pad_token_id,
)

print(outputs[0]["generated_text"])

Device set to use cpu


data from the study were collected is the of of of of the was - of of.,,, a - of - -., percent ( )., - ) the of,,,, of the of. the of the and -, was of the of and. percent in -,,, to - -. the of. the to is -, in of.., is that to, of is that


## Calculate perplexity

In [None]:
# Calculate perplexity on held-out documents
def transformer_perplexity(model, tokenizer, texts):
	model.eval()
	total_loss = 0.0
	total_tokens = 0

	with torch.no_grad():
		for text in texts:
			if len(text) == 0:
				continue
			enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=block_size, padding=True)
			outputs = model(**enc, labels=enc["input_ids"])
			loss = outputs.loss
			total_loss += loss.item() * enc["input_ids"].size(1)
			total_tokens += enc["input_ids"].size(1)

	if total_tokens == 0:
		return float('inf')
	return math.exp(total_loss / total_tokens)


pp = transformer_perplexity(model, hf_tokenizer, heldout_docs)
print(f"Held-out perplexity: {pp:.2f}")

Held-out perplexity: 1242.55


## Conclusion

Even when trained on only 100 JSON files, the transformer-based decoder model trained faster than the trigram model. Although trigram models are simple per estimate, they require extensive counting and smoothing, which increases preprocessing and memory overhead. In contrast, the transformer benefits from mini-batch optimization and efficient matrix operations.

Predictively, the transformer achieved roughly half the perplexity of the trigram model, reflecting a better fit to the data. Its ability to capture long-range context and learn distributed token representations accounts for this improvement. These results highlight the superior efficiency and modeling capacity of neural language models over n-grams.