# 🌾 AgriBot NER Training with PhoBERT (IoT Focus)

Train a PhoBERT-based NER model for a Vietnamese agricultural IoT chatbot.

**Entity Types (6 types):**
- `DATE`: tháng này, quý 1, năm nay, tháng 11, ...
- `CROP`: cam sành, lúa ST25, xoài cát chu, ...
- `AREA`: khu A, khu B, khu 1, ...
- `DURATION`: 5 phút, 10 phút, 1 giờ, ...
- `DEVICE`: máy bơm, đèn, tưới, bơm, ...
- `METRIC`: nhiệt độ, độ ẩm, ánh sáng, ...

## 📦 Step 1: Install Dependencies

In [None]:
# Install the libraries imported by the cells below (-q keeps pip's output short)
!pip install transformers datasets torch scikit-learn seqeval pandas -q

## 📊 Step 2: Load Training Data

**Upload CSV file** generated by `generate_ner_data_v2.py`

In [None]:
import json
import pandas as pd
from typing import List, Dict
from google.colab import files

# Ask for the annotated CSV through the Colab upload dialog
print("📤 Please upload your CSV file (generated by generate_ner_data_v2.py)")
uploaded = files.upload()

# Fail early with a clear message instead of an IndexError further down
# when nothing was uploaded (for example, the dialog was dismissed)
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose the CSV file.")

# Only the first uploaded file is used
csv_filename = next(iter(uploaded))
print(f"\n📂 Loading data from {csv_filename}...")

# Load CSV
df = pd.read_csv(csv_filename)

# Both columns are read by the conversion below, so check them up front
missing_columns = {'text', 'entities'} - set(df.columns)
if missing_columns:
    raise ValueError(f"CSV is missing required column(s): {sorted(missing_columns)}")

# Convert to training format: [(text, [(start, end, label), ...]), ...]
# Each 'entities' cell is parsed as a JSON list of objects with
# 'start', 'end' and 'type' keys.
training_data = []
for text, entities_json in zip(df['text'], df['entities']):
    entities = [(e['start'], e['end'], e['type']) for e in json.loads(entities_json)]
    training_data.append((text, entities))

print(f"\n✅ Loaded {len(training_data)} training examples from CSV")

# Show the first few examples as a sanity check
print("\n📝 Sample data:")
for i, (text, entities) in enumerate(training_data[:5], start=1):
    print(f"\n{i}. Text: {text}")
    print(f"   Entities: {entities}")

## 🔄 Step 3: Convert to BIO Format

In [None]:
def convert_to_bio_format(data: List[tuple]) -> List[Dict]:
    """
    Turn character-span annotations into word-level BIO tags.

    Each item of ``data`` is ``(text, entities)`` where ``entities`` is a list
    of ``(start, end, entity_type)`` character spans (``end`` exclusive).
    The text is split on whitespace; every word that shares at least one
    character with a span is tagged, the first such word with ``B-<type>``
    and the following ones with ``I-<type>``. All other words are ``O``.

    Returns a list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per
    input item, in the same order.
    """
    converted = []

    for sentence, spans in data:
        tokens = sentence.split()
        tags = ['O' for _ in tokens]

        # Map every character offset that belongs to a token to that token's
        # index; whitespace offsets are deliberately left out of the map.
        offset_to_token = {}
        cursor = 0
        for tok_idx, tok in enumerate(tokens):
            begin = sentence.find(tok, cursor)
            cursor = begin + len(tok)
            offset_to_token.update((pos, tok_idx) for pos in range(begin, cursor))

        for span_start, span_end, kind in spans:
            # Indices of the tokens touched by this span, in sentence order
            covered = sorted({
                offset_to_token[pos]
                for pos in range(span_start, span_end)
                if pos in offset_to_token
            })
            if not covered:
                continue

            head, *tail = covered
            tags[head] = f"B-{kind}"
            for tok_idx in tail:
                tags[tok_idx] = f"I-{kind}"

        converted.append({"tokens": tokens, "ner_tags": tags})

    return converted

bio_dataset = convert_to_bio_format(training_data)

# Display the first example, one "token → tag" pair per line.
# Guarded so that an empty dataset does not raise IndexError here.
print("\n📝 Example BIO format:")
if bio_dataset:
    example = bio_dataset[0]
    for token, tag in zip(example['tokens'], example['ner_tags']):
        print(f"{token:20} → {tag}")

print(f"\n✅ Converted {len(bio_dataset)} examples to BIO format")

## 🏷️ Step 4: Create Label Mapping

In [None]:
# Collect every distinct tag that occurs in the BIO-tagged data
all_labels = set()
for example in bio_dataset:
    all_labels.update(example['ner_tags'])

# Sort labels: 'O' first, the rest alphabetically (so B- tags precede I- tags)
label_list = sorted(all_labels, key=lambda x: (x != 'O', x))

# Create label mappings in both directions
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"\n🏷️ Total labels: {len(label_list)}")
print("\nLabel mapping:")
for label, idx in label2id.items():
    print(f"{idx:2d}: {label}")

# Entity type = everything after the "B-"/"I-" prefix. Splitting only once
# keeps a type that itself contains a hyphen intact, and sorting makes the
# saved list identical from run to run (a plain set has no stable order).
entity_types = sorted({label.split('-', 1)[1] for label in label_list if '-' in label})

# Save label mapping (json.dump writes the integer keys of id2label as strings)
label_mapping = {
    "label_to_id": label2id,
    "id_to_label": id2label,
    "entity_types": entity_types
}

with open('label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump(label_mapping, f, ensure_ascii=False, indent=2)

print("\n✅ Saved label_mapping.json")

## 📚 Step 5: Prepare Dataset for Training

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Convert to HuggingFace Dataset format
def prepare_dataset(bio_data, label2id):
    """
    Build a HuggingFace ``Dataset`` with ``tokens`` and ``ner_tags`` columns.

    The ``tokens`` list of every example is passed through unchanged; every
    tag string in ``ner_tags`` is replaced by its integer id from ``label2id``
    (a tag missing from ``label2id`` raises ``KeyError``).
    """
    token_column = [record["tokens"] for record in bio_data]
    tag_id_column = [
        [label2id[tag] for tag in record["ner_tags"]]
        for record in bio_data
    ]
    return Dataset.from_dict({"tokens": token_column, "ner_tags": tag_id_column})

# Split train/validation (80/20); the fixed random_state keeps the split reproducible
train_data, val_data = train_test_split(bio_dataset, test_size=0.2, random_state=42)

# Convert both splits to HuggingFace datasets with integer tag ids
train_dataset = prepare_dataset(train_data, label2id)
val_dataset = prepare_dataset(val_data, label2id)

print("\n📊 Dataset split:")
print(f"  Training: {len(train_dataset)} examples")
print(f"  Validation: {len(val_dataset)} examples")

## 🤖 Step 6: Load PhoBERT Model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

model_name = "vinai/phobert-base"
num_labels = len(label_list)

print(f"Loading {model_name}...")

# Load the pretrained tokenizer and a token-classification model whose
# output layer is sized and labelled according to the mappings built earlier
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"✅ Model loaded on {device}")
print(f"   Number of labels: {num_labels}")

## 🔧 Step 7: Tokenize Dataset

In [None]:
"""
Step 7
PhoBERT tokenizer kh√¥ng support word_ids(), c·∫ßn manual alignment
"""

def tokenize_and_align_labels(examples, tok=None, max_length=128):
    """
    Tokenize pre-split words and align NER labels with the subword tokens.

    The tokenizer used here does not expose word_ids(), so alignment is done
    by tokenizing one word at a time: every subword piece produced for a word
    receives that word's label id. This replaces text matching between decoded
    pieces and words, which mislabels tokens whenever a word is split into
    several pieces or one word's piece happens to be a substring of another.

    Args:
        examples: batch dict with "tokens" (list of word lists) and
            "ner_tags" (list of label-id lists, parallel to "tokens").
        tok: tokenizer to use; defaults to the module-level ``tokenizer``.
            It must provide ``tokenize``, ``convert_tokens_to_ids`` and the
            ``bos_token_id`` / ``eos_token_id`` / ``pad_token_id`` attributes.
        max_length: length of every returned sequence, including the two
            special tokens; longer inputs are truncated, shorter ones padded.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        lists of length ``max_length``. Special and padding positions carry
        the label -100 so the loss ignores them.

    Raises:
        ValueError: if ``max_length`` leaves no room for the special tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold the special tokens")

    tok = tokenizer if tok is None else tok
    content_budget = max_length - 2  # room left after <s> and </s>

    tokenized_inputs = {
        "input_ids": [],
        "attention_mask": [],
        "labels": []
    }

    for words, word_tags in zip(examples["tokens"], examples["ner_tags"]):
        piece_ids = []
        piece_labels = []

        # NOTE(review): assumes the tokenizer splits on whitespace before
        # applying BPE, so per-word tokenization yields the same pieces as
        # tokenizing the joined sentence - confirm if the tokenizer changes.
        for word, tag in zip(words, word_tags):
            ids = tok.convert_tokens_to_ids(tok.tokenize(word))
            piece_ids.extend(ids)
            piece_labels.extend([tag] * len(ids))

        # Truncate ids and labels together so they stay parallel
        piece_ids = piece_ids[:content_budget]
        piece_labels = piece_labels[:content_budget]

        input_ids = [tok.bos_token_id] + piece_ids + [tok.eos_token_id]
        labels = [-100] + piece_labels + [-100]
        attention_mask = [1] * len(input_ids)

        # Pad every field to the fixed length
        pad_count = max_length - len(input_ids)
        input_ids += [tok.pad_token_id] * pad_count
        attention_mask += [0] * pad_count
        labels += [-100] * pad_count

        tokenized_inputs["input_ids"].append(input_ids)
        tokenized_inputs["attention_mask"].append(attention_mask)
        tokenized_inputs["labels"].append(labels)

    return tokenized_inputs

# Tokenize both splits; the original columns are dropped so that only
# input_ids / attention_mask / labels remain
print("Tokenizing training dataset...")
tokenized_train = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Tokenizing validation dataset...")
tokenized_val = val_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=val_dataset.column_names
)

# Report how many examples each tokenized split contains
print("✅ Datasets tokenized")
print(f"   Training samples: {len(tokenized_train)}")
print(f"   Validation samples: {len(tokenized_val)}")


## 🎯 Step 8: Define Training Arguments

In [None]:
"""
Step 8 (thay th·∫ø TrainingArguments)
Disable wandb logging
"""

from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score
import os

# Switch off Weights & Biases through its environment variable
os.environ.update({"WANDB_DISABLED": "true"})

# Collator that batches the tokenized examples for token classification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric computation
def compute_metrics(p):
    """
    Compute entity-level precision, recall and F1 with seqeval.

    ``p`` unpacks into ``(predictions, labels)``: the predictions are reduced
    to class ids with an argmax over axis 2, positions whose gold label is
    -100 are dropped, and the remaining ids are mapped to tag strings through
    ``label_list`` before scoring.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks positions that must not be scored
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[pred_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)
    return {"precision": precision, "recall": recall, "f1": f1}

# Training arguments: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best validation F1 when training ends
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Cap the number of checkpoints kept on disk; without a limit every
    # epoch's checkpoint stays in output_dir
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    report_to="none",  # Disable all reporting (wandb, tensorboard, etc.)
)

print("✅ Training arguments configured")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Learning rate: {training_args.learning_rate}")
print("   Logging: Disabled (no wandb)")

## 🚀 Step 9: Train Model

In [None]:
# Initialize the Trainer with the model, both tokenized splits, the collator
# and the seqeval-based metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
print("\n🚀 Starting training...\n")
trainer.train()

print("\n✅ Training completed!")

## 📊 Step 10: Evaluate Model

In [None]:
# Evaluate on the validation set passed to the Trainer as eval_dataset
results = trainer.evaluate()

# Print every returned metric with four decimal places
print("\n📊 Evaluation Results:")
for key, value in results.items():
    print(f"  {key}: {value:.4f}")

## 💾 Step 11: Save Model

In [None]:
# Save model and tokenizer into one directory so they can be loaded together
output_dir = "./ner_extractor_final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Copy the label mapping next to the model files
import shutil
shutil.copy('label_mapping.json', f'{output_dir}/label_mapping.json')

print(f"\n✅ Model saved to {output_dir}")
print("\n📦 Files to download:")
print("  - config.json")
print("  - pytorch_model.bin (or model.safetensors)")
# The tokenizer files are needed as well to load the tokenizer at inference time
print("  - tokenizer files written by save_pretrained (e.g. vocab.txt, bpe.codes)")
print("  - label_mapping.json")

## 🧪 Step 12: Test Model

In [None]:
# Test on new examples
# Vietnamese smoke-test sentences covering device control, sensor queries,
# dates and crops (stored as proper UTF-8 so the model sees real text)
test_examples = [
    "Bật tưới khu A trong 5 phút",
    "Độ ẩm ở khu B là bao nhiêu",
    "Chi phí tháng này",
    "Tắt đèn khu C",
    "Nhiệt độ khu 1 hiện tại",
    "Doanh thu quý 2",
    "Cách trồng cam sành"
]

# Tokens that never belong to an entity
_SPECIAL_TOKENS = ("<s>", "</s>", "<pad>")
# Suffix carried by a BPE piece that is continued by the next piece
_BPE_CONTINUATION = "@@"


def _merge_subword_tokens(tokens, labels):
    """Join BPE pieces back into words; each word keeps its first piece's label."""
    words = []
    word_labels = []
    glue_next = False  # True when the previous piece ended with "@@"

    for token, label in zip(tokens, labels):
        if token in _SPECIAL_TOKENS:
            glue_next = False
            continue

        continues = token.endswith(_BPE_CONTINUATION)
        piece = token[:-len(_BPE_CONTINUATION)] if continues else token

        if glue_next and words:
            words[-1] += piece
        else:
            words.append(piece)
            word_labels.append(label)
        glue_next = continues

    return words, word_labels


def _group_bio_entities(words, word_labels):
    """Group BIO-labelled words into a list of {"type", "text"} entity dicts."""
    entities = []
    current_entity = None

    for word, label in zip(words, word_labels):
        # Underscores become spaces in every position, not only the first word
        word_text = word.replace("_", " ").strip()
        prefix, _, entity_type = label.partition("-")

        if prefix == "B" and entity_type:
            if current_entity:
                entities.append(current_entity)
            current_entity = {"type": entity_type, "text": word_text}
        elif prefix == "I" and current_entity and current_entity["type"] == entity_type:
            current_entity["text"] += " " + word_text
        elif current_entity:
            # "O", or an I- tag of a different type, closes the open entity;
            # a stray I- tag does not open a new one
            entities.append(current_entity)
            current_entity = None

    if current_entity:
        entities.append(current_entity)

    return entities


def predict_entities(text):
    """
    Run the NER model on ``text`` and return the recognised entities.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``id2label``. The input is truncated to 128 tokens. Subword pieces are
    merged back into words before the BIO tags are grouped, so an entity's
    text does not contain "@@" markers or a split word.

    Returns:
        list of dicts with keys "type" (entity type without the B-/I- prefix)
        and "text" (the entity's words joined by single spaces).
    """
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict; eval mode disables dropout so repeated calls give the same output
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)[0]

    # Decode ids to token strings and class ids to tag strings
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [id2label[p.item()] for p in predictions]

    words, word_labels = _merge_subword_tokens(tokens, labels)
    return _group_bio_entities(words, word_labels)

# Run the model on every smoke-test sentence and show what it extracted
print("\n🧪 Testing model on new examples:\n")
for example in test_examples:
    entities = predict_entities(example)
    print(f"Text: {example}")
    print(f"Entities: {entities}")
    print()

## 📥 Step 13: Download Model Files

In [None]:
# Zip model files for easy download
!zip -r ner_model.zip ner_extractor_final/
print("‚úÖ Model zipped as ner_model.zip")
print("\nüì• Download ner_model.zip from Colab Files panel")
print("\nüìã Deployment instructions:")
print("1. Extract ner_model.zip")
print("2. Copy files to: C:\\Users\\ADMIN\\Desktop\\ex\\apps\\python-ai-service\\models\\ner_extractor\\")
print("3. Restart Python AI service")