In [None]:

"""
Hybrid Cascade + QA: train the extractor and report accuracy (no DICE run)

"""


import sys, subprocess, random, re, time

# Import names whose PyPI distribution is published under a different name.
# `pip install sklearn` does not provide scikit-learn, so the import name
# must be translated before it is handed to pip.
PIP_NAME_OVERRIDES = {'sklearn': 'scikit-learn'}


def install(pkgs):
    """Ensure every module named in ``pkgs`` is importable.

    Each entry is an *import* name. If importing it fails, the matching
    distribution is installed with ``pip`` for the running interpreter.

    Args:
        pkgs: iterable of module names to check.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits non-zero.
    """
    for p in pkgs:
        try:
            __import__(p)
        except ImportError:
            # Only a missing module should trigger an install; translate the
            # import name to its distribution name when the two differ.
            dist = PIP_NAME_OVERRIDES.get(p, p)
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', dist])

# Modules the rest of this cell imports; any that cannot be imported are
# installed with pip before the imports below run.
required_modules = ['torch','transformers','datasets','fuzzysearch','numpy','pandas','sklearn']
install(required_modules)


from fuzzysearch import find_near_matches
import numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from datasets import Dataset as DS
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    pipeline
)


# Choose the compute device once: `device` is the torch device string and
# `device_id` is the matching integer index (0 for the GPU, -1 for CPU).
device, device_id = ('cuda', 0) if torch.cuda.is_available() else ('cpu', -1)
print('Using device:', device)

# Every parameter the extractor looks for (includes the DICE-style ones).
params = ['tax', 'year', 'prstp', 'a2base', 'elasmu', 'gA1', 'gsigma1', 'pback']

# Capture group shared by the DICE-style rules: an optionally negative number
# that must contain a decimal point.
_DECIMAL = r"(-?\d+\.\d+)"


def _symbol_rules(symbol, phrase):
    """Return the two standard patterns for one DICE-style parameter.

    The first matches ``<symbol> [to|is|=] <number>``; the second matches the
    spelled-out ``phrase`` followed (lazily) by the number.
    """
    return [
        symbol + r"\s*(?:to|is|=)?\s*" + _DECIMAL,
        phrase + r".*?" + _DECIMAL,
    ]


# Regex rules, tried in order per parameter; group 1 is the extracted value.
regexes = {
    'tax': [
        r"(?:tax|price|fee|charge)\s*(?:of|@|=)?\s*\$?\s*(\d{1,4}(?:,\d{3})*(?:\.\d+)?)\b",
        r"\$(\d{1,4}(?:,\d{3})*(?:\.\d+)?)\s*(?:per\s*(?:ton|t|tonne))?\b",
    ],
    'year': [
        r"\b(?:year|in|by|start(?:ing)?|target)\s*'?(\d{4})\b",
        r"\b(20[2-9]\d|2100)\b",
    ],
    # prstp additionally accepts a "prstp: <number>" form (ASCII or
    # full-width colon).
    'prstp': _symbol_rules('prstp', 'pure time pref') + [r"prstp[:：]\s*" + _DECIMAL],
    'a2base': _symbol_rules('a2base', 'damage quad coeff'),
    'elasmu': _symbol_rules('elasmu', 'util elasticity'),
    'gA1': _symbol_rules('gA1', 'tfp growth init'),
    'gsigma1': _symbol_rules('gsigma1', 'sigma decline init'),
    'pback': [
        r"(?:pback|backstop\s*(?:price|cost))\s*(?:=|to|is|at)?\s*\$?\s*(\d{2,4}(?:\.\d+)?)\b",
        r"backstop.*?\$?\s*(\d{2,4}(?:\.\d+)?)\b",
    ],
}

# (low, high) value ranges, used for synthetic data and the small fuzzy
# fallback in the extractor.
ranges = dict(
    tax=(0, 1000),          # $/tCO2
    year=(2020, 2100),
    prstp=(0.0, 0.03),
    elasmu=(0.5, 2.5),
    a2base=(0.0005, 0.008),
    gA1=(0.04, 0.12),
    gsigma1=(-0.03, -0.003),
    pback=(150, 1500),
)

# Generate synthetic data

def gen(n=2000, seed=42):
    """Build a DataFrame of ``n`` synthetic, lightly corrupted queries.

    Every row has a ``query`` column holding the noisy sentence plus one
    column per entry of ``params`` holding the clean ground-truth value as a
    string. ``tax`` is an integer in 0..200, ``year`` is an integer anywhere
    in the inclusive ``ranges['year']`` interval, and every other parameter is
    drawn uniformly from its ``ranges`` entry and formatted with six decimals.

    Args:
        n: number of rows to generate.
        seed: seed applied to the global ``random`` module, so repeated calls
            with the same arguments return identical data.

    Returns:
        pandas.DataFrame with ``n`` rows.
    """
    random.seed(seed)
    rows = []
    keys_extra = [k for k in params if k not in ('tax', 'year')]
    year_lo, year_hi = ranges['year']
    for _ in range(n):
        vals = {}
        for k in params:
            if k == 'tax':
                vals[k] = str(random.choice(range(0, 201)))
            elif k == 'year':
                # Sample the whole inclusive range; choosing from the 2-tuple
                # of endpoints would only ever produce the first or last year.
                vals[k] = str(random.randint(year_lo, year_hi))
            else:
                lo, hi = ranges[k]
                vals[k] = f"{random.uniform(lo,hi):.6f}"
        main = f"In {vals['year']}, set a carbon tax of ${vals['tax']}/ton."
        # The remaining parameters are listed in a random order.
        extras = ", ".join(f"{k} to {vals[k]}" for k in random.sample(keys_extra, len(keys_extra)))
        txt = main + " Also set " + extras + "."

        # Add light noise: each character has a 0.5% chance of being swapped
        # with its neighbour, deleted, or replaced by a random lowercase letter.
        chars = list(txt)
        i = 0
        while i < len(chars):
            if random.random() < 0.005:
                op = random.choice(['swap', 'del', 'rep'])
                if op == 'swap' and i + 1 < len(chars):
                    chars[i], chars[i + 1] = chars[i + 1], chars[i]
                    i += 1  # skip the character that was just moved forward
                elif op == 'del':
                    chars.pop(i)
                    i -= 1  # re-examine the character that slid into slot i
                else:
                    # Also reached when 'swap' is drawn on the last character.
                    chars[i] = random.choice('abcdefghijklmnopqrstuvwxyz')
            i += 1
        # Occasionally (1%) drop a filler word at a random position.
        if random.random() < 0.01:
            ins = random.choice([' like', ' um', ' well'])
            pos = random.randint(0, len(chars))
            chars.insert(pos, ins)
        rows.append({'query': ''.join(chars), **vals})
    return pd.DataFrame(rows)

# Build dataset and split 70/30

# One seed drives both the synthetic corpus and the 70/30 train/validation split.
split_seed = 42
df = gen(2000, split_seed)
df_train, df_val = train_test_split(df, random_state=split_seed, test_size=0.3)

# Create QA records with fuzzy alignment (character start index)

def make_qa_records(df):
    """Turn rows into QA samples with an answer span for each parameter.

    For every row and every name in ``params``, the clean value is located in
    the (noisy) ``query`` text with a fuzzy search. The closest match becomes
    the answer span; if nothing matches, no record is emitted for that
    row/parameter pair.

    Args:
        df: DataFrame with a ``query`` column and one column per parameter.

    Returns:
        A ``datasets.Dataset`` with ``id``, ``context``, ``question`` and
        ``answers`` (``text`` / ``answer_start`` lists) columns.

    Note:
        When the same digits occur more than once in the query (e.g. a tax
        value that is also a substring of the year), the earliest of the
        equally close matches is used, which may not be the intended one.
    """
    recs = []
    for idx, r in df.iterrows():
        context = r['query']
        for k in params:
            orig_txt = str(r[k])
            if not orig_txt:
                continue
            # Keep the edit budget below the value's length: otherwise a one-
            # or two-character value is within distance 2 of any text and the
            # "match" is meaningless (possibly empty).
            budget = min(2, len(orig_txt) - 1)
            matches = find_near_matches(orig_txt, context, max_l_dist=budget)
            if not matches:
                continue
            # Prefer the closest match; break ties by position in the text.
            best = min(matches, key=lambda m: (m.dist, m.start))
            recs.append({
                'id': f"{idx}-{k}",
                'context': context,
                'question': f"What is the {k}?",
                # Store the text actually present in the context so that
                # start + len(text) is the true end of the span.
                'answers': {
                    'text': [context[best.start:best.end]],
                    'answer_start': [best.start],
                },
            })
    return DS.from_list(recs)

# Convert both splits to QA records (training split first, then validation).
train_raw, eval_raw = (make_qa_records(part) for part in (df_train, df_val))

# Tokenizer and feature preparation
tok = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def prepare_features(examples):
    """Tokenize a batch of QA examples and label each window's answer span.

    Question/context pairs are encoded with ``tok`` (context-only truncation,
    128 tokens, stride 50, padded to full length), so a long context can yield
    several overlapping windows. For each window the character-level answer
    span is converted to token indices.

    Args:
        examples: batch dict with ``question``, ``context`` and ``answers``
            (each answer has ``text`` and ``answer_start`` lists).

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added. A window is labelled ``(0, 0)`` when the example has no answer
        or the answer is not fully inside that window's context.
    """
    encoded = tok(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=128,
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    window_to_sample = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded.pop("offset_mapping")

    starts = []
    ends = []
    for win, offsets in enumerate(all_offsets):
        answer = examples['answers'][window_to_sample[win]]
        if not answer['answer_start']:
            starts.append(0)
            ends.append(0)
            continue

        # Character span of the answer within the context string.
        ans_lo = answer['answer_start'][0]
        ans_hi = ans_lo + len(answer['text'][0])

        # First and last token index that belong to the context (sequence 1).
        seq_ids = encoded.sequence_ids(win)
        ctx_first = seq_ids.index(1)
        ctx_last = len(seq_ids) - 1 - seq_ids[::-1].index(1)
        ctx_lo = offsets[ctx_first][0]
        ctx_hi = offsets[ctx_last][1]

        starts_inside = ctx_lo <= ans_lo < ctx_hi
        ends_inside = ctx_lo < ans_hi <= ctx_hi
        if not (starts_inside and ends_inside):
            # Answer is not fully inside the kept context of this window.
            starts.append(0)
            ends.append(0)
            continue

        # Walk forward past every token starting at or before the answer
        # start; the token just before the stop position is the start token.
        tok_start = ctx_first
        while tok_start <= ctx_last and offsets[tok_start][0] <= ans_lo:
            tok_start += 1
        # Walk backward past every token ending at or after the answer end;
        # the token just after the stop position is the end token.
        tok_end = ctx_last
        while tok_end >= ctx_first and offsets[tok_end][1] >= ans_hi:
            tok_end -= 1
        starts.append(tok_start - 1)
        ends.append(tok_end + 1)

    encoded["start_positions"] = starts
    encoded["end_positions"] = ends
    return encoded

# Tokenize both splits in batches, dropping the raw text columns so only the
# model inputs and span labels remain.
train_dataset, eval_dataset = (
    raw.map(prepare_features, batched=True, remove_columns=raw.column_names)
    for raw in (train_raw, eval_raw)
)

import inspect

# Model: DistilBERT with a freshly initialised QA head.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
if device == 'cuda':
    model = model.to(device)

# Training
# `no_cuda` is deprecated in favour of `use_cpu`; use whichever keyword the
# installed transformers version declares so the script runs on both.
_cpu_kwarg = (
    'use_cpu'
    if 'use_cpu' in inspect.signature(TrainingArguments.__init__).parameters
    else 'no_cuda'
)
args = TrainingArguments(
    output_dir='hybrid_v3',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    save_total_limit=1,
    report_to=['none'],
    **{_cpu_kwarg: device == 'cpu'},
)

# Trainer
# Likewise, `tokenizer=` is deprecated in favour of `processing_class=`.
_tok_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    **{_tok_kwarg: tok},
)
trainer.train()

# QA pipeline after fine-tuning
qa_pipe = pipeline('question-answering', model=model, tokenizer=tok, device=device_id)

# Simple numeric cleaner
def clean_number(s):
    """Strip $, %, °, commas and surrounding whitespace; keep numeric strings only.

    Args:
        s: candidate text, or a falsy value (``None`` / ``""``).

    Returns:
        The cleaned string if it is an optionally negative integer or decimal
        (a leading digit before the point is optional, e.g. ``".5"``),
        otherwise ``None``.
    """
    if not s:
        return None
    # Remove the symbols first and trim afterwards, so whitespace that sat
    # between a symbol and the digits ("$ 50", "50 %") does not survive.
    cleaned = re.sub(r'[,\$\%\°]', '', s).strip()
    # fullmatch: unlike match + "$", it does not tolerate a trailing newline.
    return cleaned if re.fullmatch(r"-?\d*\.?\d+", cleaned) else None

# Extractor cascade: regex rules -> small fuzzy range-endpoint hint -> QA fallback
th = 0.7  # minimum QA score required to accept a fallback answer
num_pat = re.compile(r"-?\d+(?:\.\d+)?")

def extract(q):
    """Extract all params from one query string.

    For each name in ``params`` the stages below are tried in order until one
    yields a value:

    1. the regex rules in ``regexes`` (first pattern that matches wins);
    2. a fuzzy search for the integer endpoints of the parameter's ``ranges``
       entry (the endpoint itself becomes the value);
    3. the fine-tuned QA pipeline, accepted only when its score is at least
       ``th`` and its answer is a plain number.

    Args:
        q: the query text.

    Returns:
        dict mapping every parameter name to a cleaned numeric string, or
        ``None`` when nothing usable was found.
    """
    r = {}
    for k in params:
        val = None

        # Regex rules
        for pat in regexes[k]:
            m = re.search(pat, q, flags=re.IGNORECASE)
            if m:
                val = m.group(1)
                break

        # Light fuzzy hint using range endpoints (only if still None)
        if not val and k in ranges:
            for cand in (str(int(bound)) for bound in ranges[k]):
                # Endpoints shorter than three characters (e.g. "0") are
                # within edit distance 1 of virtually any text, so they would
                # always "match" and block the QA fallback with a bogus value.
                if len(cand) < 3:
                    continue
                if find_near_matches(cand, q, max_l_dist=1):
                    val = cand
                    break

        # QA fallback
        if not val:
            out = qa_pipe(question=f"What is the {k}?", context=q)
            answer = out['answer'].strip()
            if out['score'] >= th and num_pat.fullmatch(answer):
                val = answer
        r[k] = clean_number(val) if val else None
    return r

# Evaluate on validation set

# Count, per parameter, how many validation rows were extracted exactly
# (string equality with the ground-truth column), then print the hit rates.
n = len(df_val)
acc = dict.fromkeys(params, 0)
for row in df_val.itertuples():
    res = extract(row.query)
    for k in params:
        acc[k] += int(res[k] == getattr(row, k))

print({k: round(acc[k]/n, 4) for k in params})


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/11199 [00:00<?, ? examples/s]

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,2.317
200,0.3359
300,0.2317
400,0.173
500,0.1842
600,0.1377
700,0.124
800,0.1161
900,0.0984
1000,0.0829


Device set to use cuda:0


{'tax': 0.985, 'year': 0.9867, 'prstp': 0.885, 'a2base': 0.9283, 'elasmu': 0.8867, 'gA1': 0.915, 'gsigma1': 0.9183, 'pback': 0.905}
