# DistilBERT NER Training for Reservation Email Extraction

This notebook trains a Named Entity Recognition model using DistilBERT to extract reservation fields from travel agency emails.

## Model Overview
- **Base Model**: distilbert-base-cased
- **Task**: Token Classification (NER)
- **Entities**: 14 MAIL_* fields (FIRST_NAME, ARRIVAL, DEPARTURE, etc.)
- **Format**: BIO tagging (B-ENTITY, I-ENTITY, O)


## 🚀 Setup and Installation

In [None]:
# Install required packages
!pip install transformers datasets seqeval accelerate torch
!pip install wandb  # Optional: for experiment tracking

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

## 📁 Upload Training Data

Upload your training data files generated from the BIO converter:
- `train_YYYYMMDD_HHMMSS.json`
- `val_YYYYMMDD_HHMMSS.json` 
- `test_YYYYMMDD_HHMMSS.json`
- `label_mapping.json`

In [None]:
from google.colab import files
import json
import os

# Make sure the destination folder for the uploaded files exists
os.makedirs('ner_data', exist_ok=True)

# Tell the user which files the later cells expect
print("Please upload your training data files:")
for prompt_line in (
    "1. train_*.json",
    "2. val_*.json",
    "3. test_*.json",
    "4. label_mapping.json",
):
    print(prompt_line)

# Open the Colab upload widget; the result is iterated by filename below
uploaded = files.upload()

# Relocate every upload from the working directory into ner_data/
for filename in uploaded:
    os.rename(filename, f'ner_data/{filename}')
    print(f"✅ Uploaded: {filename}")

## 🏷️ Load Data and Labels

In [None]:
import json
import glob
from pathlib import Path

# Find data files
data_dir = Path('ner_data')


def _latest_match(directory, pattern):
    """Return the last file (in sorted name order) matching *pattern*.

    The split files carry a YYYYMMDD_HHMMSS stamp in their name, so the last
    name in sorted order is the most recent export. Glob order itself is not
    guaranteed, which is why the matches are sorted before picking one.

    Raises:
        FileNotFoundError: if nothing in *directory* matches *pattern*.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f"No file matching '{pattern}' found in '{directory}'. "
            "Run the upload cell first."
        )
    return matches[-1]


def _read_json(path):
    """Load one JSON document, decoding the file explicitly as UTF-8."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


train_file = _latest_match(data_dir, 'train_*.json')
val_file = _latest_match(data_dir, 'val_*.json')
test_file = _latest_match(data_dir, 'test_*.json')
label_file = data_dir / 'label_mapping.json'

print(f"📂 Loading data files:")
print(f"  Train: {train_file}")
print(f"  Validation: {val_file}")
print(f"  Test: {test_file}")
print(f"  Labels: {label_file}")

# Load data
train_data = _read_json(train_file)
val_data = _read_json(val_file)
test_data = _read_json(test_file)
label_mapping = _read_json(label_file)

label2id = label_mapping['label2id']
# JSON object keys are always strings; the model config needs integer ids
id2label = {int(k): v for k, v in label_mapping['id2label'].items()}
num_labels = label_mapping['num_labels']

print(f"\n📊 Dataset sizes:")
print(f"  Train: {len(train_data)} records")
print(f"  Validation: {len(val_data)} records")
print(f"  Test: {len(test_data)} records")
print(f"  Labels: {num_labels} total ({len(label_mapping['entity_labels'])} entities + O)")

# Show sample data
print(f"\n📋 Sample training record:")
sample = train_data[0]
print(f"  Agency: {sample['agency']}")
print(f"  Tokens: {len(sample['tokens'])} ({sample['labeled_token_count']} labeled)")
print(f"  First 10 tokens: {sample['tokens'][:10]}")
print(f"  First 10 labels: {sample['labels'][:10]}")

## 🤗 Prepare HuggingFace Dataset

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import numpy as np

# Checkpoint name shared by the tokenizer and, later, the model
MODEL_NAME = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Summarise the tokenizer that was just loaded
for summary_line in (
    f"🔤 Loaded tokenizer: {MODEL_NAME}",
    f"  Vocab size: {tokenizer.vocab_size}",
    f"  Max length: {tokenizer.model_max_length}",
):
    print(summary_line)

def prepare_dataset(data_list):
    """Build a HuggingFace ``Dataset`` from a list of converter records.

    Each record contributes its ``email_id`` (stored under ``id``), its
    ``tokens``, its ``agency`` and its ``labels`` mapped to integer ids
    through the module-level ``label2id`` table.
    """
    columns = {'id': [], 'tokens': [], 'labels': [], 'agency': []}

    for record in data_list:
        # Translate the string tags first so an unknown tag fails early
        encoded_tags = [label2id[tag] for tag in record['labels']]
        row = {
            'id': record['email_id'],
            'tokens': record['tokens'],
            'labels': encoded_tags,
            'agency': record['agency'],
        }
        for column_name, value in row.items():
            columns[column_name].append(value)

    return Dataset.from_dict(columns)

# Convert each split with the same helper, in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    prepare_dataset(split_records)
    for split_records in (train_data, val_data, test_data)
)

# Bundle the three splits under the names the rest of the notebook uses
splits_by_name = {
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
}
dataset = DatasetDict(splits_by_name)

print(f"✅ Created HuggingFace datasets:")
print(dataset)

## 🔄 Tokenization and Label Alignment

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level labels to subwords.

    Only the first subword of each word keeps the word's label. Special
    tokens, continuation subwords, and any word index that has no entry in
    the label list receive -100 so they are ignored by the loss.
    """
    encoding = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=False,  # padding is deferred to the data collator
    )

    aligned_batch = []
    for batch_idx, word_labels in enumerate(examples['labels']):
        aligned = []
        last_word = None

        for word_idx in encoding.word_ids(batch_index=batch_idx):
            # A label is emitted only where a new, real word begins
            starts_new_word = word_idx is not None and word_idx != last_word
            if starts_new_word and word_idx < len(word_labels):
                aligned.append(word_labels[word_idx])
            else:
                aligned.append(-100)
            last_word = word_idx

        aligned_batch.append(aligned)

    encoding['labels'] = aligned_batch
    return encoding

# Run the alignment over every split, dropping the raw input columns
print("🔄 Tokenizing and aligning labels...")
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_columns,
)

print("✅ Tokenization complete")
print(tokenized_dataset)

# Compare one record before and after tokenization
sample_idx = 0
encoded_sample = tokenized_dataset['train'][sample_idx]
original_tokens = train_data[sample_idx]['tokens'][:10]
tokenized_tokens = tokenizer.convert_ids_to_tokens(encoded_sample['input_ids'][:20])
aligned_labels = encoded_sample['labels'][:20]

print(f"\n📋 Tokenization example:")
print(f"Original tokens: {original_tokens}")
print(f"Tokenized: {tokenized_tokens}")
print(f"Aligned labels: {aligned_labels}")

## 🏗️ Model Setup

In [None]:
from transformers import AutoModelForTokenClassification

# Instantiate the token-classification model with this task's label tables
model_kwargs = {
    'num_labels': num_labels,
    'id2label': id2label,
    'label2id': label2id,
    'ignore_mismatched_sizes': True,
}
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, **model_kwargs)

print(f"🤖 Loaded model: {MODEL_NAME}")
print(f"  Parameters: {model.num_parameters():,}")
print(f"  Labels: {model.num_labels}")

# Place the model on the GPU when one is present, otherwise stay on CPU
device_kind = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_kind)
model.to(device)
print(f"  Device: {device}")

## 📊 Evaluation Metrics

In [None]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    """Return entity-level F1, precision and recall computed with seqeval.

    ``eval_pred`` unpacks into (scores, label ids). Positions whose label id
    is -100 are dropped before both sequences are mapped to tag strings
    through the module-level ``id2label`` table.
    """
    scores, gold_ids = eval_pred

    # Highest-scoring class per token
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[g] for _, g in kept])

    return {
        'f1': f1_score(true_labels, true_predictions),
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
    }


print("✅ Evaluation metrics configured (seqeval for entity-level F1)")

## 🏃‍♂️ Training Configuration

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

import inspect

# The per-epoch evaluation setting is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. The install cell
# does not pin a version, so use whichever keyword this installation accepts.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./reservation-ner',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_strategy='epoch',  # must match the evaluation strategy for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',  # the 'f1' key returned by compute_metrics
    greater_is_better=True,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    dataloader_pin_memory=False,
    # The string 'none' is what turns reporting integrations off; passing
    # None can fall back to "all installed integrations", which would pick
    # up the wandb package installed earlier in this notebook.
    report_to='none',
    **{_eval_strategy_kwarg: 'epoch'},
)

# Data collator for token classification: pads each batch at collation time
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    return_tensors='pt'
)

print("⚙️ Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  FP16: {training_args.fp16}")
print(f"  Output dir: {training_args.output_dir}")

## 🚂 Initialize Trainer

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever name the installed
# Trainer accepts so this cell works on either side of that rename.
_tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print("🏃‍♂️ Trainer initialized successfully")
print(f"  Training samples: {len(tokenized_dataset['train'])}")
print(f"  Validation samples: {len(tokenized_dataset['validation'])}")

## 🔥 Start Training

In [None]:
import time

separator = "=" * 60

print("🚀 Starting training...")
print(separator)

# Wall-clock timing around the full training run
start_time = time.time()
trainer.train()
end_time = time.time()
training_time = end_time - start_time

print(separator)
print("🎉 Training completed!")
print(f"⏱️  Total training time: {training_time/3600:.2f} hours")

# Write the model and its tokenizer side by side into one folder
final_model_dir = './reservation-ner-final'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f"💾 Model saved to: {final_model_dir}")

## 📈 Evaluate on Test Set

In [None]:
# Aggregate metrics on the held-out test split
print("📊 Evaluating on test set...")

test_split = tokenized_dataset['test']
test_results = trainer.evaluate(test_split)

print("\n🎯 Test Results:")
print(f"  F1 Score: {test_results['eval_f1']:.4f}")
print(f"  Precision: {test_results['eval_precision']:.4f}")
print(f"  Recall: {test_results['eval_recall']:.4f}")
print(f"  Loss: {test_results['eval_loss']:.4f}")

# Raw predictions are needed for the per-entity breakdown
predictions = trainer.predict(test_split)
y_pred = np.argmax(predictions.predictions, axis=2)
y_true = predictions.label_ids

# Map ids to tag strings, skipping positions labelled -100
true_predictions = []
true_labels = []
for pred_row, gold_row in zip(y_pred, y_true):
    kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
    true_predictions.append([id2label[p] for p, _ in kept])
    true_labels.append([id2label[g] for _, g in kept])

# Per-entity precision / recall / F1 table
print("\n📋 Detailed Classification Report:")
print(classification_report(true_labels, true_predictions))

## 🧪 Test Model Inference

In [None]:
from transformers import pipeline

# Load the saved model and tokenizer into an NER pipeline
saved_model_dir = './reservation-ner-final'
ner_pipeline = pipeline(
    "ner",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
    aggregation_strategy="simple"
)


def _show_entities(found):
    """Print one line per aggregated entity: group, text and score."""
    for entity in found:
        print(f"  {entity['entity_group']}: {entity['word']} (confidence: {entity['score']:.3f})")


# Hand-written reservation snippet used as a smoke test
sample_text = """
Guest Name: JOHN SMITH
Arrive: 25/08/2025
Depart: 27/08/2025
Total Nights: 2 nights
Adult/Children: 2/0
Room Type: Superior Room
Rate Code: WH04199R
Total charges: AED 500.00
Travel Agent: AGODA COMPANY PTE. LTD.
"""

print("🧪 Testing model inference:")
print(f"Input text: {sample_text}")
print("\n🎯 Extracted entities:")

entities = ner_pipeline(sample_text)
_show_entities(entities)

# Repeat on a real record when the test split is not empty
if test_data:
    print("\n" + "="*50)
    print("🧪 Testing with actual test sample:")

    test_sample = test_data[0]
    test_text = " ".join(test_sample['tokens'][:200])  # First 200 tokens

    print(f"Agency: {test_sample['agency']}")
    print(f"Text preview: {test_text[:200]}...")

    entities = ner_pipeline(test_text)
    print("\n🎯 Extracted entities:")
    _show_entities(entities)

## 📦 Download Trained Model

In [None]:
import shutil
from google.colab import files
import os

# `datetime` is not imported anywhere else in the notebook; without this
# import the archive-name line below raises NameError.
from datetime import datetime

# Create zip file with model, stamped so repeated runs do not collide
model_name = f"reservation-ner-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
shutil.make_archive(model_name, 'zip', './reservation-ner-final')

print(f"📦 Created model archive: {model_name}.zip")

# Save training results: data sizes, test metrics, timing and key settings
results_summary = {
    'model_name': model_name,
    'base_model': MODEL_NAME,
    'training_data': {
        'train_samples': len(train_data),
        'val_samples': len(val_data),
        'test_samples': len(test_data)
    },
    'test_results': test_results,
    'training_time_hours': training_time/3600,
    'labels': label_mapping['entity_labels'],
    'training_config': {
        'epochs': training_args.num_train_epochs,
        'batch_size': training_args.per_device_train_batch_size,
        'learning_rate': training_args.learning_rate
    }
}

with open(f'{model_name}_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"\n📊 Training Summary:")
print(f"  Model: {MODEL_NAME}")
print(f"  Test F1: {test_results['eval_f1']:.4f}")
print(f"  Training time: {training_time/3600:.2f} hours")
print(f"  Total parameters: {model.num_parameters():,}")

# Download files through the Colab browser helper
print("\n⬇️ Download your trained model:")
files.download(f'{model_name}.zip')
files.download(f'{model_name}_results.json')

print("\n✅ Download complete!")
print("\nNext steps:")
print("1. Extract the model ZIP file locally")
print("2. Integrate with your existing parsing pipeline")
print("3. Test on new reservation emails")
print("4. Set up confidence thresholding for production")

## 🔧 Production Integration Code

Copy this code to integrate the trained model into your local system:

In [None]:
# Source text of a standalone helper module, held as a string so it can be
# printed and written out to a .py file for use outside this notebook.
# NOTE(review): the embedded extract_entities stores one value per entity
# type, so a later match of the same type overwrites an earlier one —
# confirm that is the intended behaviour.
# NOTE(review): the embedded module imports `re` but never uses it.
integration_code = '''
# Production Integration Code
# Add this to your local system

from transformers import pipeline
import re

class ReservationNERExtractor:
    def __init__(self, model_path: str, confidence_threshold: float = 0.8):
        """Initialize NER model for reservation extraction"""
        self.ner_pipeline = pipeline(
            "ner",
            model=model_path,
            tokenizer=model_path,
            aggregation_strategy="simple"
        )
        self.confidence_threshold = confidence_threshold
    
    def extract_entities(self, email_text: str) -> dict:
        """Extract entities from email text"""
        entities = self.ner_pipeline(email_text)
        
        # Filter by confidence and organize by entity type
        extracted_fields = {}
        low_confidence_fields = {}
        
        for entity in entities:
            entity_type = entity['entity_group']
            value = entity['word'].strip()
            confidence = entity['score']
            
            if confidence >= self.confidence_threshold:
                extracted_fields[entity_type] = value
            else:
                low_confidence_fields[entity_type] = {
                    'value': value,
                    'confidence': confidence
                }
        
        return {
            'extracted_fields': extracted_fields,
            'low_confidence_fields': low_confidence_fields,
            'needs_review': len(low_confidence_fields) > 0
        }

# Usage example:
# extractor = ReservationNERExtractor('./reservation-ner-model')
# result = extractor.extract_entities(email_content)
# print(result)
'''

# Show the generated module in the cell output
print("💻 Production integration code:", integration_code, sep="\n")

# Persist the module text, then hand the file to the browser for download
integration_path = 'production_integration.py'
with open(integration_path, 'w') as out_file:
    out_file.write(integration_code)

files.download(integration_path)