#Git

In [3]:
!git config --global user.email "erfan.alerom@gmail.com"
!git config --global user.name "erythm"

In [7]:
!cp /content/drive/MyDrive/ColabNotebooks/extention.ipynb /content/askqe/

cp: cannot stat '/content/drive/MyDrive/ColabNotebooks/extention.ipynb': No such file or directory


#Structured Binary AskQE Extension  
Pipeline: SRL / OpenIE / Dependency /NER (modular, skippable)

In [4]:
!git clone https://github.com/erythm/askqe.git && %cd askqe

Cloning into 'askqe'...
remote: Enumerating objects: 1119, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 1119 (delta 14), reused 5 (delta 5), pack-reused 1102 (from 2)[K
Receiving objects: 100% (1119/1119), 52.45 MiB | 15.46 MiB/s, done.
Resolving deltas: 100% (886/886), done.
Updating files: 100% (1037/1037), done.
/bin/bash: line 1: fg: no job control


In [5]:
%cd askqe

/content/askqe


# Requirments

In [5]:
# Install necessary libraries
# vLLM requires specific installation on Colab
!pip install -q vllm
!pip install -q sentence-transformers sacrebleu deep_translator nltk
!pip install -q "transformers>=4.56.0,<5.0.0"  # Fix version conflict # For NLI/Scoring

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m121.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import sys
import os
import importlib.util

REPO_PATH = "askqe"

# Add repository paths to Python path
sys.path.insert(0, REPO_PATH)
sys.path.insert(0, f"{REPO_PATH}/QG/code")
sys.path.insert(0, f"{REPO_PATH}/QA/code")
sys.path.insert(0, f"{REPO_PATH}/biomqm/askqe")
sys.path.insert(0, f"{REPO_PATH}/evaluation/string-comparison")

# --- Import Prompts from Repository ---
from QG.code.prompt import nli as qg_nli_prompt
from QG.code.prompt import vanilla as qg_vanilla_prompt
from QG.code.prompt import prompts as qg_prompts

from QA.code.prompt import qa_prompt

from biomqm.askqe.prompt import atomic_fact_prompt
from biomqm.askqe.prompt import nli as biomqm_nli_prompt

# --- Import utils using importlib ---
utils_path = f"{REPO_PATH}/evaluation/string-comparison/utils.py"
spec = importlib.util.spec_from_file_location("utils_module", utils_path)
utils_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils_module)

# Assign functions
repo_f1_score = utils_module.f1_score
repo_exact_match_score = utils_module.exact_match_score
repo_chrf_score = utils_module.chrf_score
repo_bleu_score = utils_module.bleu_score
repo_compare_answers = utils_module.compare_answers
normalize_answer = utils_module.normalize_answer

print("✅ Repository imports loaded successfully!")
print(f"   - QG prompts: vanilla, nli, srl")
print(f"   - QA prompt: qa_prompt")
print(f"   - Atomic fact prompt: atomic_fact_prompt")
print(f"   - Scoring functions: f1_score, exact_match_score, chrf_score, bleu_score")

atomic_fact_prompt_template = atomic_fact_prompt
qg_prompt_template = qg_nli_prompt
qa_prompt_template = qa_prompt

print("\n✅ Prompt templates configured:")
print(f"   atomic_fact_prompt_template = biomqm/askqe/prompt.py::atomic_fact_prompt")
print(f"   qg_prompt_template = QG/code/prompt.py::nli (best config per paper)")
print(f"   qa_prompt_template = QA/code/prompt.py::qa_prompt")

ModuleNotFoundError: No module named 'QG'

## 2.5 Load Qwen2.5-3B Model with vLLM

In [2]:
import os
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM, SamplingParams

# Configuration for vLLM with AWQ quantization
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct-AWQ"

print(f"Loading model with vLLM: {MODEL_ID}...")

try:
    llm = LLM(
        model=MODEL_ID,
        quantization="awq",
        dtype="half",
        trust_remote_code=True,
        max_model_len=2048,
        enforce_eager=True,  # Disables CUDA graph for better Colab compatibility
        gpu_memory_utilization=0.7
    )
    print("✅ vLLM Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading with vLLM: {e}")
    print("Please ensure vLLM is properly installed and GPU is available.")

Loading model with vLLM: Qwen/Qwen2.5-7B-Instruct-AWQ...
INFO 12-25 22:21:50 [utils.py:253] non-default args: {'trust_remote_code': True, 'dtype': 'half', 'max_model_len': 2048, 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq', 'enforce_eager': True, 'model': 'Qwen/Qwen2.5-7B-Instruct-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


INFO 12-25 22:21:51 [model.py:514] Resolved architecture: Qwen2ForCausalLM
INFO 12-25 22:21:51 [model.py:1661] Using max model len 2048
INFO 12-25 22:21:55 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]

INFO 12-25 22:21:56 [vllm.py:722] Cudagraph is disabled under eager mode
INFO 12-25 22:22:51 [llm.py:360] Supported tasks: ['generate']
✅ vLLM Model loaded successfully!


#Helper functions

In [3]:
SAMPLING_PARAMS = SamplingParams(
    temperature=0.3,
    top_p=0.95,
    max_tokens=256,
    stop=["]", "\n\n", "```", "<|im_end|>"]
)

def generate_text_batch(prompts, sampling_params=SAMPLING_PARAMS):
    if not prompts:
        return []

    formatted_prompts = [
        f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n"
        for p in prompts
    ]

    outputs = llm.generate(formatted_prompts, sampling_params)

    generated_texts = []
    for output in outputs:
        text = output.outputs[0].text.strip()
        # Fix brackets if needed
        if not text.endswith("]"):
            text = text + "]"
        if not text.startswith("["):
            text = "[" + text
        generated_texts.append(text)

    return generated_texts

def parse_list_output(text):
    """Parses a string representation of a list into a Python list."""
    import ast
    try:
        # Try to find the list part if there's extra text
        start = text.find('[')
        end = text.rfind(']') + 1
        if start != -1 and end != -1:
            candidate = text[start:end]
            return ast.literal_eval(candidate)
        return []
    except:
        return []

print("Helper functions defined!")

Helper functions defined!


#Load Data

In [12]:
import json
import os
import glob

# --- Configuration ---
# Set this to "biomqm" or "contratico"
DATASET_TO_USE = "biomqm"
# For CONTRATICO, specify language pair
CONTRATICO_LANG_PAIR = "en-es"

def calculate_mqm_score(errors):
    # Standard WMT MQM weights
    weights = {"Minor": 1, "Major": 5, "Critical": 25}
    score_penalty = 0
    for error in errors:
        severity = error.get("severity", "Minor")
        score_penalty += weights.get(severity, 1)
    return max(0, 100 - score_penalty)

def get_max_severity(errors):
    if not errors:
        return "No Error"
    severities = [e.get("severity", "Minor") for e in errors]
    if "Critical" in severities:
        return "Critical"
    if "Major" in severities:
        return "Major"
    return "Minor"

def load_biomqm_data(file_path, limit=None):
    data_entries = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if limit and i >= limit:
                    break
                item = json.loads(line)

                # BIOMQM structure: src, tgt, bt_tgt, errors_tgt
                entry = {
                    'id': item.get('doc_id', f'doc_{i}'),
                    'source': item['src'],
                    'backtranslation': item.get('bt_tgt', ''),
                    'errors': item.get('errors_tgt', []),
                    'mqm_score': calculate_mqm_score(item.get('errors_tgt', [])),
                    'severity': get_max_severity(item.get('errors_tgt', []))
                }
                data_entries.append(entry)

        print(f"✅ Loaded {len(data_entries)} entries from BIOMQM successfully.")
        return data_entries
    except FileNotFoundError:
        print(f"❌ Error: File {file_path} not found.")
        return []

# Path to Data
REPO_PATH = "askqe"

if DATASET_TO_USE == "biomqm":
    BIOMQM_FILE = f"{REPO_PATH}/biomqm/dev_with_backtranslation.jsonl"
    dataset = load_biomqm_data(BIOMQM_FILE, limit=100)  # Start with 100 samples for testing
else:
    print(f"❌ Unknown dataset: {DATASET_TO_USE}")
    dataset = []

if dataset:
    print("\n📊 First Entry Example:")
    print(json.dumps(dataset[0], indent=2))

✅ Loaded 100 entries from BIOMQM successfully.

📊 First Entry Example:
{
  "id": "doc11",
  "source": "Three cases of cervicofacial NF are presented in this case report.",
  "backtranslation": "This case report presents three cases of cervicofacial NF.",
  "errors": [
    {
      "term": "zervikofazialem",
      "startIndex": 44,
      "endIndex": 58,
      "error_category": "Linguistic_conventions",
      "error_subcategory": "Grammar",
      "severity": "Major"
    }
  ],
  "mqm_score": 95,
  "severity": "Major"
}


In [14]:
from transformers import pipeline as hf_pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from collections import Counter
from sacrebleu import sentence_bleu, sentence_chrf
import json

# --- PRELIMINARY CONFIGURATION ---
BATCH_SIZE = 10  # Small batch for testing
MAX_SAMPLES = 20  # Test with 20 samples first

# Create a subset of the original dataset
dataset_subset = dataset[:MAX_SAMPLES]
print(f"🔬 Testing with {len(dataset_subset)} samples.")

print("Loading DeBERTa NLI model...")
nli_pipeline = hf_pipeline("text-classification", model="potsawee/deberta-v3-large-mnli", device=0)

results = []

print(f"🚀 Starting Pipeline Execution with Batch Size {BATCH_SIZE}...\n")

for i in range(0, len(dataset_subset), BATCH_SIZE):
    batch_entries = dataset_subset[i:i+BATCH_SIZE]
    current_ids = [e['id'] for e in batch_entries]
    print(f"⚙️  Processing Batch {i//BATCH_SIZE + 1} (IDs {current_ids[0]} to {current_ids[-1]})...")

    # Initialize containers for this batch
    batch_final_facts = [[] for _ in range(len(batch_entries))]
    batch_final_questions = [[] for _ in range(len(batch_entries))]
    batch_final_answers_src = [[] for _ in range(len(batch_entries))]
    batch_final_answers_bt = [[] for _ in range(len(batch_entries))]

    # --- Step 1: Atomic Fact Extraction ---
    print("  📝 Step 1: Extracting atomic facts...")
    prompts_facts = [atomic_fact_prompt_template.replace("{{sentence}}", e['source']) for e in batch_entries]
    facts_str_list = generate_text_batch(prompts_facts)
    raw_batch_facts = [parse_list_output(s) for s in facts_str_list]

    # --- Step 1.5: Entailment Filtering (DeBERTa) ---
    print("  🔍 Step 1.5: NLI filtering...")
    flat_nli_inputs = []
    for idx, facts in enumerate(raw_batch_facts):
        if not facts:
            continue
        source = batch_entries[idx]['source']
        for fact in facts:
            text_input = f"{source} [SEP] {fact}"
            flat_nli_inputs.append({'batch_idx': idx, 'fact': fact, 'text_input': text_input})

    if flat_nli_inputs:
        nli_texts = [item['text_input'] for item in flat_nli_inputs]
        try:
            nli_results = nli_pipeline(nli_texts, batch_size=BATCH_SIZE, truncation=True, max_length=512)
            for item, res in zip(flat_nli_inputs, nli_results):
                label = res['label'].upper()
                if "CONTRADICTION" not in label:
                    batch_final_facts[item['batch_idx']].append(item['fact'])
        except Exception as e:
            print(f"  ⚠️  Error in NLI step: {e}")
            for item in flat_nli_inputs:
                batch_final_facts[item['batch_idx']].append(item['fact'])
    else:
        batch_final_facts = raw_batch_facts

    # --- Step 2: Question Generation ---
    print("  ❓ Step 2: Generating questions...")
    prompts_qg = []
    valid_indices_qg = []

    for idx, facts in enumerate(batch_final_facts):
        if not facts:
            continue
        prompt = qg_prompt_template.replace("{{sentence}}", batch_entries[idx]['source']).replace("{{atomic_facts}}", str(facts))
        prompts_qg.append(prompt)
        valid_indices_qg.append(idx)

    if prompts_qg:
        qg_results_str = generate_text_batch(prompts_qg)
        for valid_idx, res_str in zip(valid_indices_qg, qg_results_str):
            batch_final_questions[valid_idx] = parse_list_output(res_str)

    # --- Step 3 & 4: QA on Source & Backtranslation ---
    print("  💬 Step 3 & 4: Answering questions...")
    prompts_qa_src = []
    prompts_qa_bt = []
    valid_indices_qa = []

    for idx, questions in enumerate(batch_final_questions):
        if not questions:
            continue

        p_src = qa_prompt_template.replace("{{sentence}}", batch_entries[idx]['source']).replace("{{questions}}", str(questions))
        prompts_qa_src.append(p_src)

        bt_sent = batch_entries[idx]['backtranslation']
        if bt_sent:
            p_bt = qa_prompt_template.replace("{{sentence}}", bt_sent).replace("{{questions}}", str(questions))
            prompts_qa_bt.append(p_bt)
        else:
            prompts_qa_bt.append("No context provided.")

        valid_indices_qa.append(idx)

    if prompts_qa_src:
        combined_prompts = prompts_qa_src + prompts_qa_bt
        all_answers = generate_text_batch(combined_prompts)

        split_idx = len(prompts_qa_src)
        answers_src = all_answers[:split_idx]
        answers_bt = all_answers[split_idx:]

        for k, batch_idx in enumerate(valid_indices_qa):
            batch_final_answers_src[batch_idx] = parse_list_output(answers_src[k])
            batch_final_answers_bt[batch_idx] = parse_list_output(answers_bt[k])

    # --- FINAL SAVING ---
    for idx, entry in enumerate(batch_entries):
        results.append({
            'id': entry['id'],
            'source': entry['source'],
            'backtranslation': entry.get('backtranslation', ""),
            'facts': batch_final_facts[idx],
            'questions': batch_final_questions[idx],
            'answers_src': batch_final_answers_src[idx],
            'answers_bt': batch_final_answers_bt[idx],
            'mqm_score': entry.get('mqm_score', None),
            'severity': entry.get('severity', None)
        })

print("\n✅ Pipeline execution complete.")
print(f"📊 Processed {len(results)} samples.")
print(f"\n📄 First result example:")
print(json.dumps(results[0], indent=2))

🔬 Testing with 20 samples.
Loading DeBERTa NLI model...


Device set to use cuda:0


🚀 Starting Pipeline Execution with Batch Size 10...

⚙️  Processing Batch 1 (IDs doc11 to doc11)...
  📝 Step 1: Extracting atomic facts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  🔍 Step 1.5: NLI filtering...
  ❓ Step 2: Generating questions...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  💬 Step 3 & 4: Answering questions...


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

⚙️  Processing Batch 2 (IDs doc11 to doc56)...
  📝 Step 1: Extracting atomic facts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  🔍 Step 1.5: NLI filtering...
  ❓ Step 2: Generating questions...


Adding requests:   0%|          | 0/9 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/9 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  💬 Step 3 & 4: Answering questions...


Adding requests:   0%|          | 0/18 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/18 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


✅ Pipeline execution complete.
📊 Processed 20 samples.

📄 First result example:
{
  "id": "doc11",
  "source": "Three cases of cervicofacial NF are presented in this case report.",
  "backtranslation": "This case report presents three cases of cervicofacial NF.",
  "facts": [
    "Three cases of cervicofacial NF are presented in this case report."
  ],
  "questions": [
    "How many cases of cervicofacial NF are presented in this case report?"
  ],
  "answers_src": [
    3
  ],
  "answers_bt": [
    "Three"
  ],
  "mqm_score": 95,
  "severity": "Major"
}


#Install lib for Extension

In [16]:
# Install libraries for Extension pipeline
!pip install -q spacy
!python -m spacy download en_core_web_sm
!pip install -q allennlp allennlp-models
print("✅ Extension libraries installed!")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See

In [4]:
import spacy

# Load spaCy model
print("Loading spaCy for Dependency Parsing & NER...")
nlp = spacy.load("en_core_web_sm")
print("✅ spaCy loaded!")

# Test it
test_text = "Three cases of cervicofacial NF are presented in this case report."
doc = nlp(test_text)

print("\n🔍 Testing Dependency Parsing:")
for token in doc:
    if token.dep_ in ["neg", "ROOT"]:
        print(f"  {token.text:15} {token.dep_:10} {token.head.text}")

print("\n🏷️ Testing NER:")
for ent in doc.ents:
    print(f"  {ent.text:20} {ent.label_}")

Loading spaCy for Dependency Parsing & NER...
✅ spaCy loaded!

🔍 Testing Dependency Parsing:
  presented       ROOT       presented

🏷️ Testing NER:
  Three                CARDINAL
  NF                   ORG


#SRL (Semantic Role Labeling)

In [5]:
# Try simpler installation for AllenNLP
!pip install -q allennlp==2.10.1 allennlp-models==2.10.1

# Test import
try:
    from allennlp.predictors.predictor import Predictor
    print("✅ AllenNLP imported successfully!")

    # Load SRL model
    print("Loading SRL model...")
    srl_predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz"
    )
    print("✅ SRL model loaded!")

    # Test
    test_result = srl_predictor.predict(
        sentence="Three cases of cervicofacial NF are presented in this case report."
    )
    print("\n🔍 SRL Test Output:")
    print(test_result['verbs'][0] if test_result['verbs'] else "No verbs found")

except Exception as e:
    print(f"❌ AllenNLP error: {e}")
    print("\n⚠️ We'll skip SRL and use simpler alternatives")

[31mERROR: Ignored the following versions that require a different python version: 0.2.0 Requires-Python ==3.6[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement torch<1.13.0,>=1.10.0 (from allennlp) (from versions: 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0, 2.9.0, 2.9.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch<1.13.0,>=1.10.0[0m[31m
[0m❌ AllenNLP error: No module named 'allennlp'

⚠️ We'll skip SRL and use simpler alternatives


###OpenIE (⚠️ SRL Skipped - Dependency Conflict)

**Issue:** AllenNLP (required for SRL) has incompatible PyTorch version requirements:
- AllenNLP needs: `torch<1.13.0`
- vLLM needs: `torch>=2.0`

**Solution:** We skip SRL and use **OpenIE** instead, which provides similar semantic extraction (subject-relation-object triples) using spaCy's dependency parsing, without external dependencies.

**Impact:** Minimal - OpenIE covers most of SRL's functionality for our binary question generation pipeline.

In [6]:
# OpenIE with spaCy (simpler alternative)
import spacy
from itertools import combinations

def extract_openie_triples(text):
    """
    Extract (subject, relation, object) triples using spaCy dependency parsing
    Simple OpenIE without external libraries
    """
    doc = nlp(text)
    triples = []

    for token in doc:
        # Find ROOT verb
        if token.pos_ == "VERB":
            # Find subject
            subjects = [w for w in token.children if w.dep_ in ("nsubj", "nsubjpass")]
            # Find objects
            objects = [w for w in token.children if w.dep_ in ("dobj", "pobj", "attr")]

            for subj in subjects:
                for obj in objects:
                    # Get full phrases (with children)
                    subj_phrase = " ".join([w.text for w in subj.subtree])
                    obj_phrase = " ".join([w.text for w in obj.subtree])
                    relation = token.text

                    triples.append({
                        "subject": subj_phrase,
                        "relation": relation,
                        "object": obj_phrase
                    })

    return triples

# Test OpenIE
test_text = "Three cases of cervicofacial NF are presented in this case report."
triples = extract_openie_triples(test_text)

print("✅ OpenIE function defined!")
print("\n🔍 Testing OpenIE extraction:")
for triple in triples:
    print(f"  ({triple['subject']}, {triple['relation']}, {triple['object']})")

if not triples:
    print("  No triples extracted (simple sentences may not have clear subject-object pairs)")

✅ OpenIE function defined!

🔍 Testing OpenIE extraction:
  No triples extracted (simple sentences may not have clear subject-object pairs)


In [7]:
# Better test for OpenIE
test_sentences = [
    "Patients were administered 5mg of morphine.",
    "The doctor diagnosed three cases of NF.",
    "COVID-19 affects the respiratory system."
]

print("🔍 Testing OpenIE with multiple sentences:\n")
for sent in test_sentences:
    triples = extract_openie_triples(sent)
    print(f"Sentence: {sent}")
    if triples:
        for triple in triples:
            print(f"  → ({triple['subject']}, {triple['relation']}, {triple['object']})")
    else:
        print(f"  → No triples found")
    print()

🔍 Testing OpenIE with multiple sentences:

Sentence: Patients were administered 5mg of morphine.
  → No triples found

Sentence: The doctor diagnosed three cases of NF.
  → (The doctor, diagnosed, three cases of NF)

Sentence: COVID-19 affects the respiratory system.
  → (COVID-19, affects, the respiratory system)



#Dependency Parsing (negation, modifiers)

In [8]:
def extract_dependency_features(text):
    """
    Extract dependency parsing features:
    - Negations
    - Modifiers (adjectives, adverbs)
    - Important grammatical relations
    """
    doc = nlp(text)
    features = {
        "negations": [],
        "modifiers": [],
        "has_negation": False
    }

    # Find negations
    for token in doc:
        if token.dep_ == "neg":
            # Get the negated word
            negated_word = token.head.text
            features["negations"].append({
                "negation": token.text,
                "negated_word": negated_word,
                "scope": " ".join([w.text for w in token.head.subtree])
            })
            features["has_negation"] = True

    # Find modifiers
    for token in doc:
        if token.pos_ in ["ADJ", "ADV"]:
            modified_word = token.head.text
            features["modifiers"].append({
                "modifier": token.text,
                "type": token.pos_,
                "modifies": modified_word
            })

    return features

# Test Dependency Parsing
test_sentences = [
    "The virus is not contagious.",
    "Patients were not administered morphine.",
    "The highly effective treatment was used.",
    "Three cases of cervicofacial NF are presented in this case report."
]

print("✅ Dependency Parsing function defined!\n")
print("🔍 Testing Dependency Features:\n")

for sent in test_sentences:
    features = extract_dependency_features(sent)
    print(f"Sentence: {sent}")
    print(f"  Has negation: {features['has_negation']}")

    if features['negations']:
        print(f"  Negations:")
        for neg in features['negations']:
            print(f"    - '{neg['negation']}' negates '{neg['negated_word']}' (scope: {neg['scope']})")

    if features['modifiers']:
        print(f"  Modifiers:")
        for mod in features['modifiers'][:3]:  # Show first 3
            print(f"    - '{mod['modifier']}' ({mod['type']}) modifies '{mod['modifies']}'")
    print()

✅ Dependency Parsing function defined!

🔍 Testing Dependency Features:

Sentence: The virus is not contagious.
  Has negation: True
  Negations:
    - 'not' negates 'is' (scope: The virus is not contagious .)
  Modifiers:
    - 'contagious' (ADJ) modifies 'is'

Sentence: Patients were not administered morphine.
  Has negation: True
  Negations:
    - 'not' negates 'administered' (scope: Patients were not administered)

Sentence: The highly effective treatment was used.
  Has negation: False
  Modifiers:
    - 'highly' (ADV) modifies 'effective'
    - 'effective' (ADJ) modifies 'treatment'

Sentence: Three cases of cervicofacial NF are presented in this case report.
  Has negation: False
  Modifiers:
    - 'cervicofacial' (ADJ) modifies 'NF'



#NER + Coreference

In [9]:
def extract_ner_entities(text):
    """
    Extract Named Entities (NER)
    Note: Coreference resolution requires neuralcoref which has compatibility issues.
    We'll do NER only for now (entity consistency check).
    """
    doc = nlp(text)
    entities = []

    for ent in doc.ents:
        entities.append({
            "text": ent.text,
            "label": ent.label_,
            "start": ent.start_char,
            "end": ent.end_char
        })

    return entities

def check_entity_consistency(source_entities, bt_entities):
    """
    Check if entities are consistent between source and backtranslation
    """
    source_texts = set([e['text'].lower() for e in source_entities])
    bt_texts = set([e['text'].lower() for e in bt_entities])

    missing_in_bt = source_texts - bt_texts
    extra_in_bt = bt_texts - source_texts

    return {
        "source_entities": source_entities,
        "bt_entities": bt_entities,
        "missing_in_bt": list(missing_in_bt),
        "extra_in_bt": list(extra_in_bt),
        "is_consistent": len(missing_in_bt) == 0 and len(extra_in_bt) == 0
    }

# Test NER
test_pairs = [
    {
        "source": "Three cases of cervicofacial NF are presented in this case report.",
        "bt": "This case report presents three cases of cervicofacial NF."
    },
    {
        "source": "Dr. Smith diagnosed COVID-19 in New York.",
        "bt": "COVID-19 was diagnosed in New York by Dr. Johnson."
    }
]

print("✅ NER + Entity Consistency functions defined!\n")
print("🔍 Testing NER + Consistency Check:\n")

for i, pair in enumerate(test_pairs, 1):
    print(f"Example {i}:")
    print(f"  Source: {pair['source']}")
    print(f"  BT:     {pair['bt']}\n")

    source_ents = extract_ner_entities(pair['source'])
    bt_ents = extract_ner_entities(pair['bt'])

    consistency = check_entity_consistency(source_ents, bt_ents)

    print(f"  Source entities: {[e['text'] + ' (' + e['label'] + ')' for e in source_ents]}")
    print(f"  BT entities:     {[e['text'] + ' (' + e['label'] + ')' for e in bt_ents]}")
    print(f"  Consistent: {consistency['is_consistent']}")

    if consistency['missing_in_bt']:
        print(f"  ⚠️  Missing in BT: {consistency['missing_in_bt']}")
    if consistency['extra_in_bt']:
        print(f"  ⚠️  Extra in BT: {consistency['extra_in_bt']}")
    print()

✅ NER + Entity Consistency functions defined!

🔍 Testing NER + Consistency Check:

Example 1:
  Source: Three cases of cervicofacial NF are presented in this case report.
  BT:     This case report presents three cases of cervicofacial NF.

  Source entities: ['Three (CARDINAL)', 'NF (ORG)']
  BT entities:     ['three (CARDINAL)', 'NF (ORG)']
  Consistent: True

Example 2:
  Source: Dr. Smith diagnosed COVID-19 in New York.
  BT:     COVID-19 was diagnosed in New York by Dr. Johnson.

  Source entities: ['Smith (PERSON)', 'COVID-19 (PERSON)', 'New York (GPE)']
  BT entities:     ['COVID-19 (PERSON)', 'New York (GPE)', 'Johnson (PERSON)']
  Consistent: False
  ⚠️  Missing in BT: ['smith']
  ⚠️  Extra in BT: ['johnson']



#Binary Question Generation

In [10]:
# Binary Question Generation Prompt
binary_qg_prompt = """Task: You are an expert in semantic fact verification. You will be given an English sentence and a list of atomic facts derived from it.
Your goal is to convert each atomic fact into a strictly Boolean (Yes/No) question that verifies the information in the fact.
The question should be formulated such that the answer based on the atomic fact is "Yes".

Output the list of questions in a Python list format (e.g., ["Question 1?", "Question 2?"]) without any additional text or code blocks.

*** Example Starts ***
Sentence: It is not yet known whether the severity or level of control of underlying health conditions affects the risk for severe disease associated with COVID-19.
Atomic facts: ['It is not yet known whether the severity of underlying health conditions affects the risk for severe disease associated with COVID-19.', 'It is not yet known whether the level of control of underlying health conditions affects the risk for severe disease associated with COVID-19.']
Questions: ["Is it currently unknown whether the severity of underlying health conditions affects the risk for severe disease?", "Is it unknown whether the level of control of underlying health conditions affects the risk for severe disease?"]

Sentence: The number of accessory proteins and their function is unique depending on the specific coronavirus.
Atomic facts: ['The number of accessory proteins is unique depending on the specific coronavirus.', 'The function of accessory proteins is unique depending on the specific coronavirus.']
Questions: ["Is the number of accessory proteins unique depending on the specific coronavirus?", "Is the function of accessory proteins unique to the specific coronavirus?"]

Sentence: Patients were administered 5 mg of morphine intravenously.
Atomic facts: ['Patients were administered morphine.', 'The dosage of morphine was 5 mg.', 'The administration route was intravenous.']
Questions: ["Were the patients administered morphine?", "Was the dosage of morphine 5 mg?", "Was the morphine administered intravenously?"]
*** Example Ends ***

Sentence: {{sentence}}
Atomic facts: {{atomic_facts}}
Questions: """

# Binary QA Prompt (Yes/No answers)
binary_qa_prompt = """Task: You will be given an English sentence and a list of Yes/No questions. Your goal is to answer each question with either "Yes" or "No" based on the sentence.

Output only the list of answers in Python list format (e.g., ["Yes", "No", "Yes"]) without any additional explanation.

*** Example Starts ***
Sentence: Patients were administered 5 mg of morphine intravenously.
Questions: ["Were the patients administered morphine?", "Was the dosage of morphine 5 mg?", "Was the morphine administered intravenously?"]
Answers: ["Yes", "Yes", "Yes"]

Sentence: The virus is not contagious.
Questions: ["Is the virus contagious?"]
Answers: ["No"]
*** Example Ends ***

Sentence: {{sentence}}
Questions: {{questions}}
Answers: """

print("✅ Binary QG/QA prompts defined!")

# Test Binary QG
test_facts = [
    "Three cases of cervicofacial NF are presented in this case report.",
    "Patients were administered morphine.",
    "The virus is not contagious."
]

print("\n🔍 Testing Binary Question Generation:\n")

for fact in test_facts:
    prompt = binary_qg_prompt.replace("{{sentence}}", fact).replace("{{atomic_facts}}", str([fact]))
    result = generate_text_batch([prompt])
    questions = parse_list_output(result[0])

    print(f"Fact: {fact}")
    print(f"Binary Questions: {questions}")

    # Test Binary QA
    if questions:
        qa_prompt = binary_qa_prompt.replace("{{sentence}}", fact).replace("{{questions}}", str(questions))
        qa_result = generate_text_batch([qa_prompt])
        answers = parse_list_output(qa_result[0])
        print(f"Answers: {answers}")
    print()

✅ Binary QG/QA prompts defined!

🔍 Testing Binary Question Generation:



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Fact: Three cases of cervicofacial NF are presented in this case report.
Binary Questions: ['Are three cases of cervicofacial NF presented in this case report?']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Answers: ['Yes']



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Fact: Patients were administered morphine.
Binary Questions: ['Were the patients administered morphine?']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Answers: ['Yes']



Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Fact: The virus is not contagious.
Binary Questions: ['Is the virus not contagious?']


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Answers: ['Yes']



#Test on dataset

In [15]:
# Load dataset again (if needed)
dataset_subset = dataset[:20]  # Test with 20 samples
print(f"🔬 Running FULL PIPELINE with EXTENSION on {len(dataset_subset)} samples.\n")

results_with_extension = []

for i in range(0, len(dataset_subset), BATCH_SIZE):
    batch_entries = dataset_subset[i:i+BATCH_SIZE]
    current_ids = [e['id'] for e in batch_entries]
    print(f"⚙️  Processing Batch {i//BATCH_SIZE + 1} (IDs {current_ids[0]} to {current_ids[-1]})...")

    # Initialize containers
    batch_final_facts = [[] for _ in range(len(batch_entries))]
    batch_openie_triples = [[] for _ in range(len(batch_entries))]
    batch_dep_features = [[] for _ in range(len(batch_entries))]
    batch_ner_entities = [[] for _ in range(len(batch_entries))]
    batch_final_questions = [[] for _ in range(len(batch_entries))]
    batch_final_answers_src = [[] for _ in range(len(batch_entries))]
    batch_final_answers_bt = [[] for _ in range(len(batch_entries))]

    # --- Step 1: Atomic Fact Extraction ---
    print("  📝 Step 1: Extracting atomic facts...")
    prompts_facts = [atomic_fact_prompt_template.replace("{{sentence}}", e['source']) for e in batch_entries]
    facts_str_list = generate_text_batch(prompts_facts)
    raw_batch_facts = [parse_list_output(s) for s in facts_str_list]

    # --- Step 1.5: NLI Filtering ---
    print("  🔍 Step 1.5: NLI filtering...")
    flat_nli_inputs = []
    for idx, facts in enumerate(raw_batch_facts):
        if not facts: continue
        source = batch_entries[idx]['source']
        for fact in facts:
            text_input = f"{source} [SEP] {fact}"
            flat_nli_inputs.append({'batch_idx': idx, 'fact': fact, 'text_input': text_input})

    if flat_nli_inputs:
        nli_texts = [item['text_input'] for item in flat_nli_inputs]
        try:
            nli_results = nli_pipeline(nli_texts, batch_size=BATCH_SIZE, truncation=True, max_length=512)
            for item, res in zip(flat_nli_inputs, nli_results):
                label = res['label'].upper()
                if "CONTRADICTION" not in label:
                    batch_final_facts[item['batch_idx']].append(item['fact'])
        except Exception as e:
            print(f"  ⚠️  Error in NLI: {e}")
            for item in flat_nli_inputs:
                batch_final_facts[item['batch_idx']].append(item['fact'])
    else:
        batch_final_facts = raw_batch_facts

    # --- 🆕 EXTENSION PIPELINE ---
    print("  🔧 Extension: Extracting linguistic features...")

    for idx, entry in enumerate(batch_entries):
        source = entry['source']

        # OpenIE
        batch_openie_triples[idx] = extract_openie_triples(source)

        # Dependency Parsing
        batch_dep_features[idx] = extract_dependency_features(source)

        # NER
        batch_ner_entities[idx] = extract_ner_entities(source)

    # --- Step 2: Binary Question Generation ---
    print("  ❓ Step 2: Generating BINARY questions...")
    prompts_qg = []
    valid_indices_qg = []

    for idx, facts in enumerate(batch_final_facts):
        if not facts: continue

        # Use binary QG prompt instead of original
        prompt = binary_qg_prompt.replace("{{sentence}}", batch_entries[idx]['source']).replace("{{atomic_facts}}", str(facts))
        prompts_qg.append(prompt)
        valid_indices_qg.append(idx)

    if prompts_qg:
        qg_results_str = generate_text_batch(prompts_qg)
        for valid_idx, res_str in zip(valid_indices_qg, qg_results_str):
            batch_final_questions[valid_idx] = parse_list_output(res_str)

    # --- Step 3 & 4: Binary QA ---
    print("  💬 Step 3 & 4: Answering BINARY questions...")
    prompts_qa_src = []
    prompts_qa_bt = []
    valid_indices_qa = []

    for idx, questions in enumerate(batch_final_questions):
        if not questions: continue

        # Use binary QA prompt
        p_src = binary_qa_prompt.replace("{{sentence}}", batch_entries[idx]['source']).replace("{{questions}}", str(questions))
        prompts_qa_src.append(p_src)

        bt_sent = batch_entries[idx]['backtranslation']
        if bt_sent:
            p_bt = binary_qa_prompt.replace("{{sentence}}", bt_sent).replace("{{questions}}", str(questions))
            prompts_qa_bt.append(p_bt)
        else:
            prompts_qa_bt.append("No context provided.")

        valid_indices_qa.append(idx)

    if prompts_qa_src:
        combined_prompts = prompts_qa_src + prompts_qa_bt
        all_answers = generate_text_batch(combined_prompts)

        split_idx = len(prompts_qa_src)
        answers_src = all_answers[:split_idx]
        answers_bt = all_answers[split_idx:]

        for k, batch_idx in enumerate(valid_indices_qa):
            batch_final_answers_src[batch_idx] = parse_list_output(answers_src[k])
            batch_final_answers_bt[batch_idx] = parse_list_output(answers_bt[k])

    # --- FINAL SAVING with Extension Features ---
    for idx, entry in enumerate(batch_entries):
        results_with_extension.append({
            'id': entry['id'],
            'source': entry['source'],
            'backtranslation': entry.get('backtranslation', ""),
            'facts': batch_final_facts[idx],
            'openie_triples': batch_openie_triples[idx],
            'dependency_features': batch_dep_features[idx],
            'ner_entities': batch_ner_entities[idx],
            'binary_questions': batch_final_questions[idx],
            'answers_src': batch_final_answers_src[idx],
            'answers_bt': batch_final_answers_bt[idx],
            'mqm_score': entry.get('mqm_score', None),
            'severity': entry.get('severity', None)
        })

print("\n✅ FULL PIPELINE with EXTENSION complete!")
print(f"📊 Processed {len(results_with_extension)} samples.")
print(f"\n📄 First result with extension:")
print(json.dumps(results_with_extension[0], indent=2))

🔬 Running FULL PIPELINE with EXTENSION on 20 samples.

⚙️  Processing Batch 1 (IDs doc11 to doc11)...
  📝 Step 1: Extracting atomic facts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  🔍 Step 1.5: NLI filtering...
  🔧 Extension: Extracting linguistic features...
  ❓ Step 2: Generating BINARY questions...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  💬 Step 3 & 4: Answering BINARY questions...


Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

⚙️  Processing Batch 2 (IDs doc11 to doc56)...
  📝 Step 1: Extracting atomic facts...


Adding requests:   0%|          | 0/10 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  🔍 Step 1.5: NLI filtering...
  🔧 Extension: Extracting linguistic features...
  ❓ Step 2: Generating BINARY questions...


Adding requests:   0%|          | 0/9 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/9 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  💬 Step 3 & 4: Answering BINARY questions...


Adding requests:   0%|          | 0/18 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/18 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


✅ FULL PIPELINE with EXTENSION complete!
📊 Processed 20 samples.

📄 First result with extension:
{
  "id": "doc11",
  "source": "Three cases of cervicofacial NF are presented in this case report.",
  "backtranslation": "This case report presents three cases of cervicofacial NF.",
  "facts": [
    "Three cases of cervicofacial NF are presented in this case report."
  ],
  "openie_triples": [],
  "dependency_features": {
    "negations": [],
    "modifiers": [
      {
        "modifier": "cervicofacial",
        "type": "ADJ",
        "modifies": "NF"
      }
    ],
    "has_negation": false
  },
  "ner_entities": [
    {
      "text": "Three",
      "label": "CARDINAL",
      "start": 0,
      "end": 5
    },
    {
      "text": "NF",
      "label": "ORG",
      "start": 29,
      "end": 31
    }
  ],
  "binary_questions": [
    "Are three cases of cervicofacial NF presented in this case report?"
  ],
  "answers_src": [
    "Yes"
  ],
  "answers_bt": [
    "Yes"
  ],
  "mqm_score": 95,

#Eval

In [16]:
# Calculate AskQE scores
print("📊 Calculating AskQE Scores...\n")

for result in results_with_extension:
    answers_src = result['answers_src']
    answers_bt = result['answers_bt']

    # Calculate exact match for binary answers
    min_len = min(len(answers_src), len(answers_bt))

    if min_len > 0:
        # Binary exact match
        matches = sum(1 for i in range(min_len) if answers_src[i] == answers_bt[i])
        result['askqe_score'] = matches / min_len
        result['num_questions'] = min_len
        result['num_matches'] = matches
    else:
        result['askqe_score'] = 1.0  # No questions = no errors
        result['num_questions'] = 0
        result['num_matches'] = 0

print("✅ Scores calculated!")

# Show statistics
import pandas as pd
df = pd.DataFrame(results_with_extension)

print("\n📈 Statistics by Severity:")
print(df.groupby('severity')['askqe_score'].agg(['mean', 'count']))

print("\n📋 Sample results:")
print(df[['id', 'severity', 'num_questions', 'num_matches', 'askqe_score']].head(10))

print(f"\n🎯 Overall AskQE Score: {df['askqe_score'].mean():.3f}")

📊 Calculating AskQE Scores...

✅ Scores calculated!

📈 Statistics by Severity:
          mean  count
severity             
Major      0.9      5
Minor      1.0      7
No Error   1.0      8

📋 Sample results:
      id  severity  num_questions  num_matches  askqe_score
0  doc11     Major              1            1          1.0
1  doc11     Minor              2            2          1.0
2  doc11     Major              2            2          1.0
3  doc11     Minor              3            3          1.0
4  doc11     Minor              3            3          1.0
5  doc11  No Error              2            2          1.0
6  doc11  No Error              2            2          1.0
7  doc11  No Error              3            3          1.0
8  doc11  No Error              4            4          1.0
9  doc11  No Error              4            4          1.0

🎯 Overall AskQE Score: 0.975


In [1]:
# Load MORE diverse samples
BIOMQM_FILE = f"{REPO_PATH}/biomqm/dev_with_backtranslation.jsonl"
dataset = load_biomqm_data(BIOMQM_FILE, limit=200)  # 200 samples

# Make sure we have diverse severities
print(f"\n📊 Dataset distribution:")
import pandas as pd
df_temp = pd.DataFrame(dataset)
print(df_temp['severity'].value_counts())
print(f"\nUnique IDs: {df_temp['id'].nunique()}")

NameError: name 'REPO_PATH' is not defined