In [None]:
# Import grobid
!wget https://github.com/kermitt2/grobid/archive/0.8.2.zip
!unzip 0.8.2.zip -d /content/polymer_nlp_extractor/workspace
!rm 0.8.2.zip

In [None]:
# Add the root directory of the project to the Python path.
import sys

# Single source of truth for the project location; later cells build paths from it.
PROJECT_ROOT = '/content/polymer_nlp_extractor'

# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
# install dependencies with logs
! pip install -r {PROJECT_ROOT}/requirements.txt

In [None]:
# Backend bootstrap switch: False only calls initialize_all_resources();
# True first deletes collections and buckets, then re-initialises.
RESET_BACKEND = False

from polymer_extractor.services.setup_service import SetupService
from fastapi import HTTPException

setup_service = SetupService()

if not RESET_BACKEND:
    # Plain initialisation path.
    try:
        setup_service.initialize_all_resources()
        print("System initialized successfully.")
        print("System initialization completed successfully.")
    except Exception as e:
        error_msg = f"Failed to initialize system: {e}"
        print(f"ERROR: Failed to initialize system: {e}")
        print(f"Initialization failed: {error_msg}")
        raise HTTPException(status_code=500, detail=error_msg)
else:
    # Destructive path: drop everything, then initialise from scratch.
    try:
        # Both managers are obtained from the setup service before anything is deleted.
        db_manager = setup_service.get_database_manager()
        bucket_manager = setup_service.get_bucket_manager()

        # Remove every collection except the logger's own one.
        for collection in db_manager.list_collections():
            collection_id = collection['$id']
            if collection_id == "system_logs":
                continue
            db_manager.delete_collection(collection_id)

        # Remove every storage bucket.
        for bucket in bucket_manager.list_buckets():
            bucket_manager.delete_bucket(bucket['$id'])

        # Recreate resources.
        setup_service.initialize_all_resources()
        print("System reset and reinitialized successfully.")
        result = {"status": "success", "message": "System reset and reinitialized successfully."}
        print(f"Reset completed: {result['message']}")
    except Exception as e:
        error_msg = f"Failed to reset system: {e}"
        print(f"ERROR: Failed to reset system: {e}")
        print(f"Reset failed: {error_msg}")
        raise HTTPException(status_code=500, detail=error_msg)


In [None]:
import os
from polymer_extractor.services import grobid_service
from polymer_extractor.services.grobid_service import GrobidService
from pydantic import BaseModel

# === Response Models ===

class ServerStatusResponse(BaseModel):
    """Response model for server status checks."""
    # Short state tag; this notebook sets it to "running", "unreachable" or "started".
    status: str
    # Human-readable description of the check/start outcome.
    message: str
    # Server address; in this notebook it is always read from
    # GrobidService.grobid_server_url.
    server_url: str

# Workspace directory, located directly under the project root
WORKSPACE_DIR = os.path.join(PROJECT_ROOT, "workspace")

# Service object used for every GROBID call in this cell
grobid_service = GrobidService()

# Probe the server; any exception raised by the probe counts as "not running"
try:
    grobid_service.check_server_status()
    grobid_is_running = True
    grobid_status_response = ServerStatusResponse(
        status="running",
        message="GROBID server is alive and responding",
        server_url=grobid_service.grobid_server_url,
    )
    print("GROBID server is alive and responding")
except Exception as e:
    grobid_is_running = False
    print(f"WARNING: GROBID server status check failed: {e}")
    grobid_status_response = ServerStatusResponse(
        status="unreachable",
        message=f"GROBID server is not responding: {e}",
        server_url=grobid_service.grobid_server_url,
    )

# Start the server only when the probe above failed
if not grobid_is_running:
    # Directory created by unpacking the 0.8.2 archive into the workspace
    grobid_home = os.path.join(WORKSPACE_DIR, "grobid-0.8.2")
    try:
        print("Starting GROBID server via API request")
        grobid_service.start_server(grobid_home=grobid_home)
        grobid_status_response = ServerStatusResponse(
            status="started",
            message="GROBID server started successfully",
            server_url=grobid_service.grobid_server_url,
        )
        print("GROBID server started successfully")
    except Exception as e:
        print(f"ERROR: Failed to start GROBID server via API: {e}")
        raise Exception(f"Failed to start GROBID server: {e}")


In [None]:
# Process a File

import asyncio
import os
import tempfile
import time
from pathlib import Path
from typing import Optional

from fastapi import HTTPException
from pydantic import BaseModel, Field

# Request models
# Input model for the token-packing step. In this notebook it is instantiated
# directly inside process_file_pipeline with the processed TEI path.
class TokenPackRequest(BaseModel):
    """
    Request body for /api/preprocess/tokenpack
    """
    # Required field: `...` as the default means no fallback value exists.
    tei_path: str = Field(
        ...,
        description="Absolute filesystem path to cleaned TEI XML file."
    )

# Input model for the TEI processing step. In this notebook it is instantiated
# directly inside process_file_pipeline with the TEI path returned by GROBID.
class TEIProcessRequest(BaseModel):
    """
    Request body for /api/preprocess/tei
    """
    # Required field: `...` as the default means no fallback value exists.
    tei_path: str = Field(
        ...,
        description="Absolute filesystem path to the TEI XML file for cleaning and metadata extraction."
    )

class ProcessingResult(BaseModel):
    """Outcome of the GROBID step for one input file.

    In this notebook the optional fields are filled from the dict returned by
    ``grobid_service.process_document`` via ``result.get(...)``, so the two
    path fields can legitimately be ``None`` and are typed ``Optional[str]``
    (a plain ``str`` annotation rejects an explicit ``None`` under pydantic v2).
    """
    # True when the document was processed; the pipeline always passes True here.
    success: bool
    # Human-readable summary of the outcome.
    message: str
    # Name of the input file as supplied by the caller.
    original_file: str
    # Value of the 'pdf_file' key of the service result, or None when absent.
    pdf_file: Optional[str] = None
    # Value of the 'metadata' key of the service result.
    metadata: dict = {}
    # Value of the 'local_tei_path' key of the service result, or None when absent.
    local_tei_path: Optional[str] = None
    # Value of the 'storage_success' key of the service result.
    storage_success: bool = False
    # Value of the 'storage_errors' key of the service result.
    storage_errors: list = []

# Mock services and functions - replace with actual implementations
class MockFile:
    """Minimal stand-in for an uploaded file, backed by a path on disk.

    Exposes ``filename`` (final path component), ``filepath`` (the path as
    given) and an awaitable ``read()`` returning the file's bytes.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Tail of the path, i.e. everything after the last separator.
        self.filename = os.path.split(filepath)[1]

    async def read(self):
        """Return the complete file content as bytes."""
        handle = open(self.filepath, 'rb')
        try:
            return handle.read()
        finally:
            handle.close()

class MockGrobidService:
    """Placeholder GROBID service that fabricates a successful result."""

    def process_document(self, temp_path, filename_stem, original_filename):
        """Return a canned result dict derived only from ``temp_path``.

        ``filename_stem`` and ``original_filename`` are accepted but unused.
        The TEI path is the input path with '.pdf' replaced by '.xml'; no
        file is created.
        """
        source = str(temp_path)
        return dict(
            pdf_file=source,
            metadata=dict(title='Test Document'),
            local_tei_path=source.replace('.pdf', '.xml'),
            storage_success=True,
            storage_errors=[],
        )

class MockTEIProcessingService:
    """Placeholder TEI processor that only derives an output path."""

    def process(self, tei_path):
        """Report success with '.xml' in ``tei_path`` replaced by '_processed.xml'."""
        processed = tei_path.replace('.xml', '_processed.xml')
        return dict(success=True, processed_path=processed)

class MockTokenizerService:
    """Placeholder tokenizer service returning a fixed audit summary."""

    def audit_and_extend_all(self, force=False):
        """Return a canned summary that echoes ``force`` back as 'force_rebuild'."""
        summary = {"tokenizers_updated": 3}
        summary["force_rebuild"] = force
        return summary

class MockTokenPackingService:
    """Placeholder token packer that only derives an output path."""

    def process(self, tei_path):
        """Report success with '.xml' in ``tei_path`` replaced by '_packed.json'."""
        packed = tei_path.replace('.xml', '_packed.json')
        return dict(success=True, packed_path=packed)

def cleanup_temp_file(temp_path):
    """Delete ``temp_path`` if it is set and exists; otherwise do nothing."""
    # Nothing to do for None / empty values.
    if not temp_path:
        return
    # Already gone (or never created).
    if not os.path.exists(temp_path):
        return
    os.remove(temp_path)

# Initialize services and variables
# Build the workspace path from PROJECT_ROOT rather than from the previous
# WORKSPACE_DIR value: joining "workspace" onto an existing WORKSPACE_DIR
# produced <root>/workspace/workspace and nested one level deeper on every
# re-run of this cell.
WORKSPACE_DIR = os.path.join(PROJECT_ROOT, "workspace")
# NOTE: this rebinds grobid_service to the mock defined above.
grobid_service = MockGrobidService()

async def process_file_pipeline(file_path=None):
    """Run one document through the four preprocessing stages and collect the results.

    Stages, in order:
      1. Copy the input into a temp file and pass it to
         ``grobid_service.process_document``.
      2. Process the TEI file named by the 'local_tei_path' key of that result.
      3. Run the tokenizer audit.
      4. Token-pack the TEI path produced by stage 2.

    The temp file from stage 1 is removed on every exit path, success or failure.

    Args:
        file_path: Path of the document to process. When omitted, the example
            file ``raw_inputs/057.pdf`` under ``WORKSPACE_DIR`` is used.

    Returns:
        dict with the keys ``grobid_result`` (a ``ProcessingResult``),
        ``tei_result``, ``tokenizer_result`` and ``token_result``.

    Raises:
        HTTPException: 404 if the input file or a TEI file does not exist,
            400 for an unsupported extension, 500 if a stage fails or the
            GROBID step returns no TEI path.

    Note:
        With the mock services defined in this cell, the reported TEI path is
        never written to disk, so stage 2 is expected to stop with a 404 until
        the mocks are replaced by real implementations.
    """
    if file_path is None:
        file_path = os.path.join(WORKSPACE_DIR, "raw_inputs", "057.pdf")  # Example file path

    # Check if file exists
    if not os.path.exists(file_path):
        raise HTTPException(status_code=404, detail=f"File not found: {file_path}")

    file = MockFile(file_path)

    temp_path = None
    try:
        # Step 1: Upload a file via grobid and retain its path for step 2
        try:
            allowed_extensions = {'.pdf', '.xml', '.html', '.htm'}
            file_ext = Path(file.filename).suffix.lower()
            if file_ext not in allowed_extensions:
                # sorted() keeps the message stable; set iteration order is not.
                raise HTTPException(
                    status_code=400,
                    detail=f"Unsupported file type: {file_ext}. Allowed: {', '.join(sorted(allowed_extensions))}"
                )

            original_stem = Path(file.filename).stem
            content = await file.read()

            temp_dir = Path(tempfile.gettempdir())
            temp_filename = f"{original_stem}_{int(time.time())}{file_ext}"
            temp_path = temp_dir / temp_filename
            with open(temp_path, 'wb') as temp_file:
                temp_file.write(content)

            result = grobid_service.process_document(
                temp_path,
                filename_stem=original_stem,
                original_filename=file.filename
            )

            grobid_result = ProcessingResult(
                success=True,
                message=f"Successfully processed {file.filename}",
                original_file=file.filename,
                pdf_file=result.get('pdf_file'),
                metadata=result.get('metadata', {}),
                local_tei_path=result.get('local_tei_path'),
                storage_success=result.get('storage_success', False),
                storage_errors=result.get('storage_errors', [])
            )

            tei_path = result.get('local_tei_path')

        except HTTPException:
            # Already carries the right status code; pass it through untouched.
            raise
        except Exception as e:
            print(f"ERROR: Processing failed for uploaded file - {e}")
            raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}") from e

        # Step 2: Take the grobid output path as input and perform processing of the xml
        if not tei_path:
            raise HTTPException(status_code=500, detail="No TEI path from grobid processing")

        req_tei = TEIProcessRequest(tei_path=tei_path)

        print(f"INFO: Received TEI processing request: tei_path={req_tei.tei_path}")

        if not os.path.isabs(req_tei.tei_path) or not os.path.exists(req_tei.tei_path):
            print(f"ERROR: TEI file not found: {req_tei.tei_path}")
            raise HTTPException(status_code=404, detail=f"TEI file not found: {req_tei.tei_path}")

        try:
            tei_service = MockTEIProcessingService()
            tei_result = tei_service.process(req_tei.tei_path)
            # Fall back to the unprocessed path when the service reports none.
            processed_tei_path = tei_result.get('processed_path', req_tei.tei_path)

            print(f"INFO: TEI processing completed for {req_tei.tei_path}")
        except Exception as e:
            print(f"ERROR: TEI processing failed: {e}")
            raise HTTPException(status_code=500, detail=f"TEI processing failed: {str(e)}") from e

        # Step 3: Attempt tokenization
        force = False  # Define force parameter
        print(f"INFO: Received tokenizer audit request with force={force}")
        try:
            tokenizer_service = MockTokenizerService()
            audit_results = tokenizer_service.audit_and_extend_all(force=force)

            print("INFO: Tokenizer audit completed successfully.")

            tokenizer_result = {
                "success": True,
                "message": "All tokenizers audited and extended successfully.",
                "force_rebuild": force,
                "audit_results": audit_results
            }
        except Exception as e:
            print(f"ERROR: Tokenizer audit failed: {e}")
            raise HTTPException(status_code=500, detail=f"Tokenizer audit failed: {str(e)}") from e

        # Step 4: Take the output path of the xml processing in step 2 and use it for token packing
        req_token = TokenPackRequest(tei_path=processed_tei_path)

        print(f"INFO: Received token packing request: tei_path='{req_token.tei_path}'")

        if not os.path.isfile(req_token.tei_path):
            print(f"ERROR: TEI file does not exist: {req_token.tei_path}")
            raise HTTPException(status_code=404, detail=f"TEI file not found: {req_token.tei_path}")

        try:
            packing_service = MockTokenPackingService()
            token_result = packing_service.process(tei_path=req_token.tei_path)

            print(f"INFO: Token packing completed for all models on {req_token.tei_path}")

        except Exception as e:
            print(f"ERROR: Token packing failed: {e}")
            raise HTTPException(status_code=500, detail=f"Token packing failed: {str(e)}") from e

        # Return combined results
        return {
            "grobid_result": grobid_result,
            "tei_result": tei_result,
            "tokenizer_result": tokenizer_result,
            "token_result": token_result
        }
    finally:
        # Clean up temp file on every exit path; cleanup_temp_file ignores
        # None and paths that no longer exist.
        cleanup_temp_file(temp_path)

# Execute the pipeline
if __name__ == "__main__":
    # asyncio.run() raises RuntimeError when an event loop is already running
    # in the current thread, as is the case inside a Jupyter/Colab kernel.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running (plain script execution): asyncio.run() is safe.
        result = asyncio.run(process_file_pipeline())
    else:
        # A loop is already running: drive the coroutine on a fresh loop in a
        # worker thread and block until it finishes. Exceptions raised by the
        # pipeline propagate through .result().
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            result = pool.submit(asyncio.run, process_file_pipeline()).result()
    print("Pipeline run completed.")


In [None]:
# Full fine-tuning pipeline for Polymer NLP Extractor

import os
import re
import gc
import torch
import random
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Any
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
import wandb
from dotenv import load_dotenv

# ==== 1. LOAD CONFIGURATION ====
# load_dotenv() is called with no arguments, then the W&B login uses whatever
# WANDB_API_KEY holds in the process environment.
load_dotenv()
# os.getenv returns None when WANDB_API_KEY is unset, so wandb.login is then
# called with key=None.
# NOTE(review): confirm how wandb.login behaves with key=None in a notebook.
wandb.login(key=os.getenv("WANDB_API_KEY"))

from polymer_extractor.model_config import ENSEMBLE_MODELS, LABELS, LABEL2ID, ID2LABEL
from polymer_extractor.utils.paths import WORKSPACE_DIR
from polymer_extractor.services.constants.templates import SENTENCE_TEMPLATES
from polymer_extractor.services.constants.polymer_names import POLYMER_NAMES
from polymer_extractor.services.constants.property_names import PROPERTY_NAMES
from polymer_extractor.services.constants.scientific_units import SCIENTIFIC_UNITS
from polymer_extractor.services.constants.scientific_symbols import SCIENTIFIC_SYMBOLS
from polymer_extractor.services.constants.material_names import MATERIAL_NAMES
from polymer_extractor.services.constants.value_formats import VALUE_FORMATS

# Dataset input folders and the fine-tuned model output folder, all under the workspace.
TRAINING_DIR = Path(WORKSPACE_DIR, "datasets", "training")
TESTING_DIR = Path(WORKSPACE_DIR, "datasets", "testing")
OUTPUT_DIR = Path(WORKSPACE_DIR, "models", "finetuned")
# Only the output folder is created here; missing parents are created as well.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ==== 2. DATASET LOADING ====

def get_real_datasets(override: bool = False, hours: int = 48) -> List[pd.DataFrame]:
    """Load the CSV files found in TRAINING_DIR and TESTING_DIR.

    A file is loaded when ``override`` is true or when its modification time
    is less than ``hours`` hours old. Rows without a "sentence" value are
    dropped and the index is reset. A file that fails to load is reported
    with a warning and skipped.

    Returns:
        One DataFrame per successfully loaded file, in discovery order.
    """
    reference_time = datetime.now()
    max_age = timedelta(hours=hours)
    frames = []

    for directory in (TRAINING_DIR, TESTING_DIR):
        for csv_path in directory.glob("*.csv"):
            age = reference_time - datetime.fromtimestamp(csv_path.stat().st_mtime)
            # Too old and not forced: ignore this file.
            if not override and age >= max_age:
                continue
            try:
                frame = pd.read_csv(csv_path)
                frame = frame.dropna(subset=["sentence"])
                frames.append(frame.reset_index(drop=True))
            except Exception as e:
                print(f"[WARN] Skipped {csv_path.name}: {e}")

    return frames

def generate_synthetic_data(n: int = 25000) -> List[Dict[str, str]]:
    """Build ``n`` synthetic samples by filling random sentence templates.

    For each sample one value is drawn per entity field (polymer, property,
    unit, symbol, value, material), then one template is drawn and formatted
    with those values.

    Returns:
        A list of dicts holding the "sentence" plus the six drawn field values.
    """
    # Field name -> pool it is drawn from; the order fixes the RNG call order.
    field_pools = (
        ("polymer", POLYMER_NAMES),
        ("property", PROPERTY_NAMES),
        ("unit", SCIENTIFIC_UNITS),
        ("symbol", SCIENTIFIC_SYMBOLS),
        ("value", VALUE_FORMATS),
        ("material", MATERIAL_NAMES),
    )

    samples = []
    for _ in range(n):
        picks = {field: random.choice(pool) for field, pool in field_pools}
        template = random.choice(SENTENCE_TEMPLATES)
        samples.append({"sentence": template.format(**picks), **picks})
    return samples

# ==== 3. TOKENIZATION + LABELING ====

def tokenize_and_label(sample, tokenizer):
    """Tokenize one sample and project character-level entity tags onto its tokens.

    Every literal occurrence of an entity value in the sentence is tagged per
    character (``B-<ENT>`` on the first character, ``I-<ENT>`` on the rest).
    Each token then takes the first non-"O" tag found inside its character
    span. Entities are applied in the order POLYMER, PROPERTY, UNIT, SYMBOL,
    VALUE, MATERIAL, so a later entity overwrites an earlier one where their
    matches overlap.

    Args:
        sample: Mapping with a "sentence" string and, optionally, the keys
            "polymer", "property", "unit", "symbol", "value", "material".
            Values that are not strings, or are shorter than two characters,
            are ignored.
        tokenizer: Callable accepting ``(text, return_offsets_mapping=True,
            truncation=True)`` and returning an encoding that exposes
            ``input_ids``, ``attention_mask`` and ``offset_mapping``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; the label list
        has one entry per token.
    """
    # Label id that the PyTorch cross-entropy loss ignores by default.
    ignore_index = -100

    sent = sample["sentence"]
    entities = {
        "POLYMER": sample.get("polymer", ""),
        "PROPERTY": sample.get("property", ""),
        "UNIT": sample.get("unit", ""),
        "SYMBOL": sample.get("symbol", ""),
        "VALUE": sample.get("value", ""),
        "MATERIAL": sample.get("material", "")
    }

    encoding = tokenizer(sent, return_offsets_mapping=True, truncation=True)
    offsets = encoding.offset_mapping
    label_map = ["O"] * len(sent)

    for ent, val in entities.items():
        # Skip missing values; non-strings (e.g. NaN from a CSV) would break len().
        if not isinstance(val, str) or len(val) < 2:
            continue
        for match in re.finditer(re.escape(val), sent):
            begin, stop = match.start(), match.end()
            label_map[begin:stop] = [f"I-{ent}"] * (stop - begin)
            label_map[begin] = f"B-{ent}"

    label_ids = []
    for start, end in offsets:
        if start == end:
            # Zero-width offset: the token covers no characters of the sentence
            # (presumably the tokenizer's special tokens), so it is excluded
            # from the loss instead of being trained as "O".
            label_ids.append(ignore_index)
        else:
            tag = next((l for l in label_map[start:end] if l != "O"), "O")
            # Unknown tags fall back to id 0 -- assumes 0 is the "O" class; TODO confirm.
            label_ids.append(LABEL2ID.get(tag, 0))

    return {
        "input_ids": encoding.input_ids,
        "attention_mask": encoding.attention_mask,
        "labels": label_ids
    }

def to_dataset(encodings: List[Dict[str, Any]]) -> Dataset:
    """Turn a list of per-sample encodings into a column-oriented Dataset.

    Each encoding must provide "input_ids", "attention_mask" and "labels";
    the resulting Dataset has exactly these three columns.
    """
    column_names = ("input_ids", "attention_mask", "labels")
    columns = {name: [enc[name] for enc in encodings] for name in column_names}
    return Dataset.from_dict(columns)

# ==== 4. DATASET PREPARATION ====

# Fixed seed so the synthetic corpus, the shuffle and therefore the
# train/validation split are identical on every Restart & Run All.
SEED = 42
random.seed(SEED)

override_real = False  # ← change to True to force all real data to be used
real_dfs = get_real_datasets(override=override_real)
# NOTE(review): only the "sentence" column is carried over, so tokenize_and_label
# finds no entity values for these samples and tags every token "O" -- confirm
# that real sentences are meant to be pure negatives.
real_samples = [
    {"sentence": sentence}
    for df in real_dfs
    for sentence in df["sentence"].tolist()
]

print(f"[INFO] Found {len(real_samples)} real samples.")

# Create final training corpus
synthetic = generate_synthetic_data(n=25000)
corpus = synthetic + real_samples
random.shuffle(corpus)

# 90 % of the shuffled corpus for training, the remainder for validation
split = int(len(corpus) * 0.9)
train_samples, val_samples = corpus[:split], corpus[split:]

print(f"[INFO] Training with {len(train_samples)} | Validation with {len(val_samples)}")

# ==== 5. TRAIN EACH MODEL ====

for model_cfg in ENSEMBLE_MODELS:
    print(f"\n[TRAINING] {model_cfg.name} ({model_cfg.model_id})")

    tokenizer = AutoTokenizer.from_pretrained(model_cfg.model_id, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        model_cfg.model_id,
        num_labels=len(LABELS),
        id2label=ID2LABEL,
        label2id=LABEL2ID
    )

    # The corpus is re-encoded with each model's own tokenizer.
    train_encoded = [tokenize_and_label(s, tokenizer) for s in train_samples]
    val_encoded = [tokenize_and_label(s, tokenizer) for s in val_samples]
    train_ds = to_dataset(train_encoded)
    val_ds = to_dataset(val_encoded)
    # The intermediate Python lists are not needed once the datasets exist.
    del train_encoded, val_encoded

    out_path = OUTPUT_DIR / model_cfg.name
    out_path.mkdir(parents=True, exist_ok=True)

    args = TrainingArguments(
        output_dir=str(out_path),
        # Name the tracking run after the model so runs are distinguishable.
        run_name=model_cfg.name,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=model_cfg.training_config.get("lr", 2e-5),
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=model_cfg.training_config.get("epochs", 5),
        weight_decay=model_cfg.training_config.get("weight_decay", 0.01),
        # Effective batch size per device: 8 * 4 = 32
        gradient_accumulation_steps=4,
        logging_dir=str(out_path / "logs"),
        logging_steps=50,
        load_best_model_at_end=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        report_to=["wandb"]
    )

    # NOTE(review): newer transformers releases rename the `tokenizer` argument
    # to `processing_class` -- confirm against the version in requirements.txt.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    trainer.save_model(str(out_path))
    print(f"[SAVED] {model_cfg.name} → {out_path}")

    # Close this model's W&B run; otherwise the next Trainer keeps logging
    # into the run that is still active and the models' metrics get mixed.
    wandb.finish()

    # Memory cleanup: drop the references, collect garbage first so the
    # tensors are actually freed, then hand cached CUDA blocks back.
    del trainer, model, tokenizer, train_ds, val_ds
    gc.collect()
    torch.cuda.empty_cache()
