In [None]:
!pip install -U bitsandbytes accelerate transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed bitsandbytes-0.49.1 transformers-5.1.0


In [None]:
import torch
import pandas as pd
import re
import random
import json

In [None]:
# Configuration
number_of_data = 3  # number of leading dataset rows used for guideline refinement

# Seed Python's RNG: the number of few-shot examples per refinement step is drawn
# with random.randint, so an unseeded run is not repeatable.
SEED = 42
random.seed(SEED)

In [None]:
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig
)

llm_name = "Qwen/Qwen3-4B-Instruct-2507"

# Load the weights in 4-bit precision through bitsandbytes.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=False, load_in_4bit=True
)

# Load tokenizer (configured to pad on the left).
tokenizer = AutoTokenizer.from_pretrained(llm_name, padding_side="left")

# Prevent some transformers specific issues.
tokenizer.use_default_system_prompt = False
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Load LLM.
# `dtype` replaces `torch_dtype`, which the installed transformers release
# reports as deprecated.
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    quantization_config=quantization_config,
    device_map="auto",
    dtype=torch.bfloat16,
)

# Set LLM on eval mode. The trailing semicolon stops the notebook from echoing
# the full module tree as the cell output.
llm.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]



vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
 

In [None]:
# Generation settings used for every call to the model.
# Sampling is switched off so repeated runs give the same text; this may cost
# some accuracy compared with sampled decoding.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=2048,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Generate the model's answer to a single user prompt.
def generate(prompt, llm=llm, generation_config=generation_config):
    """Run one single-turn chat completion and return the decoded answer.

    Args:
        prompt: Text sent as the only user turn of the conversation.
        llm: Model whose ``generate`` method is called. Defaults to the
            module-level ``llm`` as it was bound when this function was defined.
        generation_config: Generation settings forwarded to ``llm.generate``.

    Returns:
        The newly generated text, decoded with special tokens skipped and with
        leading/trailing whitespace stripped.
    """
    turns = [{'role': 'user', 'content': prompt}]

    # Ask explicitly for a mapping: the code below iterates over `.items()`,
    # which would fail if a bare tensor of token ids were returned instead.
    inputs = tokenizer.apply_chat_template(
        turns,
        add_generation_prompt=True,
        return_tensors='pt',
        return_dict=True,
    )

    inputs = {k: v.to(llm.device) for k, v in inputs.items()}
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        outputs = llm.generate(
            input_ids=input_ids,
            # Pad and EOS share one id in this notebook, so pass the mask
            # explicitly rather than leaving generate() to infer it.
            attention_mask=inputs.get("attention_mask"),
            generation_config=generation_config,
        )

    # Keep everything after the prompt. The final token is deliberately NOT
    # sliced off: when generation stops at max_new_tokens it is real content,
    # and a trailing EOS is already removed by skip_special_tokens=True.
    answer_tokens = outputs[0, input_ids.shape[1]:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()


def parse_answer(answer):
    """Return the first entity-type label mentioned in ``answer``.

    Args:
        answer: Free-form text to scan.

    Returns:
        The label that occurs earliest in ``answer`` (one of "Cell_line",
        "Protein", "RNA", "DNA", "Cell_type"), or the string "none" when no
        label occurs.
    """
    labels = ["Cell_line", "Protein", "RNA", "DNA", "Cell_type"]

    # Build the alternation from the label list so that the two cannot drift
    # apart (the previous literal pattern could not match "RNA" or "DNA" alone).
    pattern = "(" + "|".join(re.escape(label) for label in labels) + ")"

    match = re.search(pattern, answer)
    return match.group(1) if match else "none"


In [None]:
# Read the JSON-Lines file: one JSON object per non-blank line.
# The encoding is given explicitly so non-ASCII characters are decoded the same
# way on every platform; blank lines are skipped instead of crashing the parser.
with open("dev.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert the list of dicts into a pandas DataFrame
train_data = pd.DataFrame(data)
print(train_data.shape)

(1669, 2)


In [None]:
# First `number_of_data` rows of the dataset, used as inputs for guideline refinement.
sample = train_data.head(number_of_data)
# Bare last expression: the notebook renders the frame as a table instead of
# the plain-text dump produced by print().
sample

                                                text  \
0  The inhibition of c-fos and c-jun expression b...   
1  IL-4 did not affect the stability of the c-fos...   
2  Finally, using electrophoretic mobility shift ...   

                                            entities  
0  [{'text': 'c-jun genes', 'type': 'DNA', 'span'...  
1  [{'text': 'c-fos', 'type': 'Protein', 'span': ...  
2  [{'text': 'AP-1', 'type': 'Protein', 'span': [...  


# Auto-update the guideline

Guideline template:


Your task is to recognize the entities in the following text.
Use the guidelines provided below.


Guidelines:

```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.
```

Text: Induction of NF-KB during monocyte differentiation by HIV type 1 infection.


In [None]:
def clean_actual(ner_list):
    """Render gold entities as a sorted, comma-separated string of pairs.

    A list input is read as dicts with 'text' and 'type' keys; each becomes
    "('text', 'type')" and the pairs are sorted before joining. Any other
    input is returned as ``str(ner_list)``.
    """
    if isinstance(ner_list, list):
        ordered = sorted((entity['text'], entity['type']) for entity in ner_list)
        rendered = []
        for text, label in ordered:
            rendered.append(f"('{text}', '{label}')")
        return ", ".join(rendered)

    # Not a list: fall back to its plain string form.
    return str(ner_list)

def clean_model_output(output_str):
    """Normalise model output to the same pair format as ``clean_actual``.

    Every "('text', 'type')" occurrence in ``output_str`` is extracted, the
    pairs are sorted, and they are re-joined with ", ".

    Args:
        output_str: Raw text produced by the model.

    Returns:
        The normalised pair string, or ``output_str`` unchanged when it
        contains no pair in the expected format.
    """
    matches = re.findall(r"\('([^']*)', '([^']*)'\)", output_str)

    if not matches:
        return output_str

    # Format each pair by hand rather than via str(tuple): the tuple repr
    # escapes backslashes and control characters, which would make otherwise
    # identical entities compare unequal to clean_actual's output.
    return ", ".join(f"('{text}', '{label}')" for text, label in sorted(matches))

In [None]:
# Initial guideline: a fenced md block that only lists the entity types.
# It is the first entry of guideline_history and the first argument passed to
# process_prompt.
current_guideline = """
```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.
```
"""

In [None]:
def extract_md_blocks(text):
    """Collect the contents of all fenced blocks in ``text`` into one md fence.

    Fences may be opened with or without the ``md`` tag. The bodies are joined
    with newlines and wrapped in a single md fence. An empty string is returned
    when no closed fence is found.
    """
    fence = re.compile(r"```(?:md)?\s*(.*?)\n```", re.DOTALL)
    bodies = fence.findall(text)
    if not bodies:
        return ""
    return "```md\n{}\n```".format("\n".join(bodies))

# Guideline versions in chronological order, starting with the initial guideline;
# process_prompt appends one entry per processed row.
guideline_history = [current_guideline]
# Few-shot example strings shown to the model, one entry per processed row.
example_history = []

def process_prompt(current_guideline, sample, is_random_no_example, num_examples = 5):
  """Refine an NER guideline once for every row of ``sample``.

  For each row, random annotated examples are drawn from ``train_data``
  (excluding the row itself), the model is asked to update the guideline from
  them, and the fenced md block of its answer becomes the new guideline.
  ``example_history`` and ``guideline_history`` each get one new entry per row.

  Args:
      current_guideline: Guideline text to start from.
      sample: DataFrame whose rows drive the iterations; its index labels are
          used to exclude the current row from the example pool.
      is_random_no_example: When truthy, the number of examples is redrawn
          uniformly from 1..10 on every iteration and ``num_examples`` is ignored.
      num_examples: Number of examples per iteration when
          ``is_random_no_example`` is falsy.

  Returns:
      The guideline after the last iteration. Rebinding the parameter inside
      this function does not change the caller's variable, so the return value
      is the only way to obtain the result besides ``guideline_history``.
  """
  for e in sample.itertuples():
      print("Guideline: ")
      print(current_guideline)
      print("##########")
      # ------- Construct example --------- #
      if (is_random_no_example):
        num_examples = random.randint(1, 10)

      # The row being processed is never offered as one of its own examples.
      candidates = train_data[train_data.index != e.Index]
      # Cap the draw so sampling cannot fail when fewer rows are available.
      n_draw = min(num_examples, len(candidates))
      # Derive pandas' seed from Python's RNG so seeding `random` alone makes
      # the example selection repeatable.
      random_examples = candidates.sample(n=n_draw, random_state=random.randrange(2**32))
      print(n_draw)

      example_str = ""
      for ex in random_examples.itertuples():
          example_str += f"Text: {ex.text}\nEntities: {clean_actual(ex.entities)}\n\n"
      example_history.append(example_str)

      prompt = f"""Below is a work-in-progress NER guideline. I will then provide an annotated examples.
      Your task is to update and expand the guideline based on what can be learned from examples.

      Guidelines should remain concise.

      Guidelines::\n{current_guideline}\n\nExamples:

      {example_str}

      """
      model_answer = generate(prompt)
      print("model answer: ")
      print(model_answer)

      # Only adopt the answer when it actually contains a closed md block.
      # Otherwise (e.g. a truncated answer) keep the guideline built so far
      # instead of replacing it with an empty string.
      updated_guideline = extract_md_blocks(model_answer)
      if updated_guideline:
        current_guideline = updated_guideline
      else:
        print("No fenced guideline block found in the model answer; keeping the previous guideline.")
      # Save the version in effect after this iteration
      guideline_history.append(current_guideline)

  return current_guideline





In [None]:
# Run the refinement loop over the selected rows.
# NOTE: with is_random_no_example=True the function redraws the example count
# on every iteration, so the num_examples value passed here is not used.
process_prompt(current_guideline, sample=sample, is_random_no_example=True, num_examples = 5)

Guideline: 

```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.
```

##########
model answer: 
```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.

- Rules:
  - 'Protein': Named entities representing proteins, often with a prefix (e.g., NF-kappa B, GATA-1, IL-2R), or when a gene is described as being "activated" or "regulated" by a factor, the protein is inferred if the name is canonical.
  - 'DNA': Gene names (e.g., GATA-1 gene), cDNAs, or sequences described as "genes", "gene products", or "dependent genes" — especially when associated with regulation or expression.
  - 'RNA': Terms like 'cDNAs', 'mRNAs', 'RNA', or references to transcripts, especially when explicitly mentioned as such (e.g., "cDNAs were cloned").
  - 'Cell_line': Entities derived from a lineage, often named with a compound modifier (e.g., "ethyl-methanesulfonate-derived subclones", "parental line", "subclones").
  - 'Cell_type': Cell populations or states described by functional or development

In [None]:
# Print every recorded guideline version, oldest first: the initial guideline,
# followed by the entries appended by process_prompt.
for g in guideline_history:
  print(g)


```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.
```

```md
- Entity types: Cell_line, Protein, RNA, DNA, Cell_type.

- Rules:
  - 'Protein': Named entities representing proteins, often with a prefix (e.g., NF-kappa B, GATA-1, IL-2R), or when a gene is described as being "activated" or "regulated" by a factor, the protein is inferred if the name is canonical.
  - 'DNA': Gene names (e.g., GATA-1 gene), cDNAs, or sequences described as "genes", "gene products", or "dependent genes" — especially when associated with regulation or expression.
  - 'RNA': Terms like 'cDNAs', 'mRNAs', 'RNA', or references to transcripts, especially when explicitly mentioned as such (e.g., "cDNAs were cloned").
  - 'Cell_line': Entities derived from a lineage, often named with a compound modifier (e.g., "ethyl-methanesulfonate-derived subclones", "parental line", "subclones").
  - 'Cell_type': Cell populations or states described by functional or developmental state (e.g., "quiescent progenitors