Comparing GPT2-Medium and QA models in the presence of noise

In [None]:
import os, json, time, random, re, subprocess, sys

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    pipeline,
)
from peft import get_peft_model, LoraConfig, TaskType

# Extra dependency
try:
    from fuzzysearch import find_near_matches
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fuzzysearch"])
    from fuzzysearch import find_near_matches

# Basic setup

device = 0 if torch.cuda.is_available() else -1
random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Generate 5,000 noisy samples

def introduce_noise(text, typo_prob=0.05):
    """Inject simple character-level noise and occasional fillers."""
    chars = list(text)
    i = 0
    while i < len(chars):
        if random.random() < typo_prob:
            op = random.choice(['swap','delete','replace'])
            if op == 'swap' and i+1 < len(chars):
                chars[i], chars[i+1] = chars[i+1], chars[i]; i += 1
            elif op == 'delete':
                del chars[i]; i -= 1
            else:
                chars[i] = random.choice('abcdefghijklmnopqrstuvwxyz')
        i += 1
    result = ''.join(chars)

    # Add a small chance of colloquial fillers

    if random.random() < 0.1:
        insert = random.choice([' like', ' you know', ' um', ' uh'])
        pos = random.randint(0, len(result))
        result = result[:pos] + insert + result[pos:]
    return result

TEMPLATES = [
    "Set a carbon tax of ${tax} per ton in {year}.",
    "In {year}, impose a tax of ${tax}/t.",
    "Apply ${tax} carbon levy by {year}.",
    "By year {year}, tax CO2 at ${tax} per tonne.",
    "What about charging ${tax} tax in {year}?"
]

def generate_noisy_samples(n=5000, seed=42):
    """Create a DataFrame of noisy queries with gold tax/year labels."""
    random.seed(seed)
    rows = []
    for _ in range(n):
        tax = random.choice(range(0, 201, 1))
        year = random.choice(range(2020, 2100))
        clean = random.choice(TEMPLATES).format(tax=tax, year=year)
        rows.append({"query": introduce_noise(clean), "tax": str(tax), "year": str(year)})
    return pd.DataFrame(rows)

# Cascade extractor (regex + fuzzy matching)
TAXES = [str(t) for t in range(0, 201, 1)]
YEARS = [str(y) for y in range(2020, 2100)]

_tax_re_list = [
    re.compile(r"\$\s*(\d{1,3})"),   # $50
    re.compile(r"\b(?:USD|usd)\s*(\d{1,3})\b"),   # USD 50
    re.compile(r"\b(?:tax|levy|carbon)\D{0,10}?(\d{1,3})\b") # tax (......) 50
]
_year_re = re.compile(r"\b(20\d{2})\b", re.IGNORECASE)

_fillers = re.compile(r"\b(like|you know|um|uh)\b", re.IGNORECASE)

def extract_cascade(text):
    """First try regex on a filler-cleaned text; then back off to fuzzy matching."""
    txt = _fillers.sub(" ", text)

    # year
    m_y = _year_re.search(txt)
    year = m_y.group(1) if m_y else None

    # tax

    tax = None
    for r in _tax_re_list:
        m = r.search(txt)
        if m:
            tax = m.group(1)
            break

    # fuzzy backoff
    if year is None:
        for y in YEARS:
            if find_near_matches(y, txt, max_l_dist=1):
                year = y; break
    if tax is None:
        for tval in TAXES:
            if find_near_matches(tval, txt, max_l_dist=2):
                tax = tval; break
    return tax, year

# Data
df = generate_noisy_samples(5000)
df_train, df_val = train_test_split(df, test_size=0.3, random_state=42)

# DistilBERT QA training data
def make_qa_records(df_):
    """Build simple SQuAD-style QA records for tax/year."""
    recs = []
    for _, r in df_.iterrows():
        for key in ['tax', 'year']:
            txt = str(r[key])
            start = r.query.find(txt)
            if start >= 0:
                recs.append({
                    'context': r.query,
                    'question': f"What is the {key}?",
                    'answers': {'text':[txt], 'answer_start':[start]}
                })
    return recs

train_qa = Dataset.from_list(make_qa_records(df_train))
val_qa   = Dataset.from_list(make_qa_records(df_val))

tokenizer_q = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model_q     = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased').to(device)

def prepare_features(ex):

    # Tokenize QA pairs and map character offsets to token start and end positions.

    enc = tokenizer_q(
        ex['question'], ex['context'],
        truncation='only_second', max_length=128,
        padding='max_length', return_offsets_mapping=True
    )
    starts, ends = [], []
    for i, off in enumerate(enc.offset_mapping):
        sc = ex['answers'][i]['answer_start'][0]
        ec = sc + len(ex['answers'][i]['text'][0])
        seq = enc.sequence_ids(i)
        c_idx = [j for j, s in enumerate(seq) if s == 1]
        if not c_idx:
            starts.append(0); ends.append(0); continue
        c0, c1 = c_idx[0], c_idx[-1]
        s, e = c0, c0
        for idx in range(c0, c1+1):
            if off[idx][0] <= sc < off[idx][1]: s = idx
            if off[idx][0] < ec <= off[idx][1]: e = idx
        starts.append(s); ends.append(e)
    enc['start_positions'] = starts
    enc['end_positions']   = ends
    return enc

train_ds = train_qa.map(prepare_features, batched=True, remove_columns=train_qa.column_names)
val_ds   = val_qa.map(prepare_features, batched=True, remove_columns=val_qa.column_names)

trainer = Trainer(
    model=model_q,
    args=TrainingArguments(
        output_dir='dbert_q_cascade',
        num_train_epochs=5,
        per_device_train_batch_size=4,
        learning_rate=1e-4,
        logging_steps=50,
        report_to=['none'],
        no_cuda=(device == -1)
    ),
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer_q
)

t0 = time.time()
trainer.train()
time_cascade_dbert = time.time() - t0

# Cascade + QA fallback  evaluation
qa_pipe = pipeline('question-answering', model=model_q, tokenizer=tokenizer_q, device=device)

acc_cascade = {'tax': 0, 'year': 0}
n_val = len(df_val)
for r in df_val.itertuples():
    t_c, y_c = extract_cascade(r.query)
    t_out = t_c if t_c is not None else qa_pipe(question="What is the tax?",  context=r.query)['answer'].strip()
    y_out = y_c if y_c is not None else qa_pipe(question="What is the year?", context=r.query)['answer'].strip()
    if str(t_out) == str(r.tax):  acc_cascade['tax']  += 1
    if str(y_out) == str(r.year): acc_cascade['year'] += 1
for k in acc_cascade:
    acc_cascade[k] = acc_cascade[k] / n_val

# GPT-2 medium + LoRA fine-tuning and evaluation
def train_eval_lora_medium(df_train_, df_val_):
    """LoRA fine-tune GPT-2 Medium and compute EM on noisy validation data."""
    samples = []
    for _, r in df_train_.iterrows():
        prompt = f'User query: "{r.query}"\nExtract JSON:'
        completion = ' ' + json.dumps({'tax': r.tax, 'year': r.year})
        samples.append({'prompt': prompt, 'completion': completion})

    ds = Dataset.from_pandas(pd.DataFrame(samples))
    tok = AutoTokenizer.from_pretrained('gpt2-medium', use_fast=True)
    tok.pad_token = tok.eos_token
    tok.pad_token_id = tok.eos_token_id

    base = AutoModelForCausalLM.from_pretrained('gpt2-medium').to(device)
    base.config.pad_token_id = tok.eos_token_id

    model = get_peft_model(
        base,
        LoraConfig(task_type=TaskType.CAUSAL_LM, r=4, lora_alpha=32, lora_dropout=0.1)
    )

    def pre(ex):
        txt = ex['prompt'] + ex['completion']
        enc = tok(txt, truncation=True, padding='max_length', max_length=128)
        l0  = len(tok(ex['prompt'], truncation=True, max_length=128).input_ids)
        lab = [-100] * l0 + enc.input_ids[l0:]
        lab += [-100] * (128 - len(lab))
        return {'input_ids': enc.input_ids, 'attention_mask': enc.attention_mask, 'labels': lab}

    tr_ds = ds.map(pre, remove_columns=ds.column_names)

    tr = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir='gpt2m_lora_noise',
            num_train_epochs=5,
            per_device_train_batch_size=4,
            learning_rate=1e-4,
            report_to=['none'],
            no_cuda=(device == -1)
        ),
        train_dataset=tr_ds,
        tokenizer=tok
    )

    t1 = time.time()
    tr.train()
    time_lora = time.time() - t1

    gen = pipeline('text-generation', model=model, tokenizer=tok, device=device, return_full_text=False)
    acc = {}
    for k in ['tax', 'year']:
        corr = 0
        for r in df_val_.itertuples():
            prompt = f'User query: "{r.query}"\nExtract JSON:'
            out = gen(prompt, max_new_tokens=40, do_sample=False)[0]['generated_text']
            js = {}
            if '{' in out and '}' in out:
                try:
                    js = json.loads(out[out.find('{'): out.rfind('}')+1])
                except Exception:
                    js = {}
            if str(js.get(k)) == str(getattr(r, k)):
                corr += 1
        acc[k] = corr / len(df_val_)
    return acc, time_lora

acc_lora, time_lora = train_eval_lora_medium(df_train, df_val)

# Show Comparison
print('Cascade + DistilBERT QA acc:', acc_cascade, 'time:', round(time_cascade_dbert, 1), 's')
print('GPT-2 medium+LoRA noisy acc:', acc_lora, 'time:', round(time_lora, 1), 's')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5833 [00:00<?, ? examples/s]

Map:   0%|          | 0/2518 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
50,0.7386
100,0.0781
150,0.0905
200,0.0622
250,0.1588
300,0.0937
350,0.1089
400,0.0806
450,0.0646
500,0.0771


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

  tr = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.6418
1000,0.0154
1500,0.0141
2000,0.0128
2500,0.0128
3000,0.0125
3500,0.0117
4000,0.0114


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are no

Cascade + DistilBERT QA acc: {'tax': 0.856, 'year': 0.82} time: 179.4 s
GPT-2 medium+LoRA noisy acc: {'tax': 0.8793333333333333, 'year': 0.89} time: 347.0 s
