## Environment Setup

In [None]:
%%capture
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    !pip install transformers datasets
    !pip install scikit-multilearn
    #!pip install pyarrow datasets -q
    #!pip install 'huggingface_hub[cli]'
    #!huggingface cli login
    # Create the project directory on the Colab VM's local disk (/content, not Google Drive)
    !mkdir -p "/content/swedish_historical_ner"

In [None]:
import os
# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably this has to be set before CUDA is first initialised
# in the process to have any effect — confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Controls whether models and datasets are pre-loaded from drive or not
load_from_pretrained = False

In [None]:
import json
import ast
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import shutil
import nbformat
import copy
import random
from tqdm import tqdm
from collections import defaultdict
from IPython import get_ipython
from IPython.display import display, HTML
from collections import Counter, defaultdict
from transformers import AutoTokenizer, AutoModel, BertForTokenClassification, MegatronBertForTokenClassification, Trainer, TrainingArguments, AutoModelForMaskedLM
from transformers import XLNetTokenizer, XLNetForTokenClassification, DebertaV2Tokenizer, DebertaV2ForTokenClassification, AutoModelForTokenClassification, EarlyStoppingCallback
from IPython.display import display, Markdown
from datasets import load_dataset, Features, Sequence, Value, load_from_disk, Dataset
from typing import Dict, List, Tuple, Optional
from skmultilearn.model_selection import IterativeStratification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from google.colab import drive, userdata, _message

# Set up project directories: a dedicated folder on the Colab VM, or the
# current working directory when running anywhere else.
if IN_COLAB:
    BASE_DIR = "/content/swedish_historical_ner"
else:
    BASE_DIR = "."

DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
RESULTS_DIR = os.path.join(BASE_DIR, "results")

# Create directories if they don't exist
for directory in [DATA_DIR, MODELS_DIR, RESULTS_DIR]:
    os.makedirs(directory, exist_ok=True)

# Set random seeds for reproducibility. Both NumPy and the stdlib `random`
# module are imported by this notebook, so both generators are seeded.
SEED = 69
np.random.seed(SEED)
random.seed(SEED)

## Load Kubhist data

These are historical Swedish newspaper texts.

In [None]:
import requests
import json
import time
import os

# Base URL of the Korp web service; endpoint names are appended to it below.
KORP_API_URL = "https://ws.spraakbanken.gu.se/ws/korp/v8"
# NOTE(review): hard-coded absolute path on one developer's machine; it will not
# exist on Colab or on any other computer — consider deriving it from DATA_DIR.
OUTPUT_DIR = "/Users/danielyebra/Desktop/Uni/Subjects/NLP & Text Mining/Assignments/Assignment 2/Data"
CHUNK_SIZE = 1000  # Size of the hit window (start..end) requested per API call
TARGET_SENTENCES = 100000  # Download loop stops once this many hits have been requested

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

def list_available_corpora():
    """Lists all available corpora from the Korp API.

    Queries ``{KORP_API_URL}/info`` and, if no corpora can be extracted from
    that response, falls back to ``{KORP_API_URL}/corpus_list``. Several
    response layouts are tolerated (dict keyed by corpus ID, list of IDs,
    list of dicts carrying an ``"id"`` key).

    Every entry is normalised to a plain string ID before filtering and
    sorting, so a listing made of dicts can no longer make ``sorted`` raise
    and discard the Kubhist corpora that were already found.

    Returns:
        list[str]: IDs whose name contains "kubhist" (case-insensitive), in
        listing order. An empty list is returned on any error.
    """
    def _corpus_id(entry):
        """Return the corpus ID of a listing entry, or "" if it has none."""
        if isinstance(entry, str):
            return entry
        if isinstance(entry, dict):
            return str(entry.get("id", ""))
        return ""

    try:
        # Use the info endpoint to get corpus information. The timeout keeps
        # the cell from hanging forever on an unresponsive server.
        response = requests.get(f"{KORP_API_URL}/info", timeout=60)
        print(f"API URL: {KORP_API_URL}/info")
        print(f"Status code: {response.status_code}")

        # Debug: print only the first 500 characters of the raw response
        response_text = response.text[:500]
        print(f"Response preview: {response_text}...")

        data = response.json()
        print("Response JSON structure:", type(data))

        all_corpora = []

        if isinstance(data, dict):
            # Print the top-level keys to understand the structure
            print("Top-level keys:", data.keys())

            if "corpora" in data:
                corpora_data = data["corpora"]
                # A dict yields its keys (the corpus IDs); a list yields its items
                if isinstance(corpora_data, (dict, list)):
                    all_corpora = list(corpora_data)
            # Alternative structure seen in some Korp APIs
            elif "corpora_info" in data:
                corpora_data = data["corpora_info"]
                if isinstance(corpora_data, dict):
                    all_corpora = list(corpora_data.keys())
            # Another alternative structure
            elif "corpus_list" in data:
                all_corpora = list(data["corpus_list"] or [])

        # If we still have no corpora, try a direct corpus listing endpoint
        if not all_corpora:
            print("Trying alternative corpus listing endpoint...")
            try:
                alt_response = requests.get(f"{KORP_API_URL}/corpus_list", timeout=60)
                alt_data = alt_response.json()
                print("Alternative endpoint structure:", type(alt_data))

                if isinstance(alt_data, list):
                    all_corpora = alt_data
                elif isinstance(alt_data, dict) and "corpora" in alt_data:
                    all_corpora = list(alt_data["corpora"] or [])
            except Exception as e:
                print(f"Alternative endpoint error: {e}")

        # Normalise every entry to a non-empty string ID
        corpus_ids = [cid for cid in (_corpus_id(entry) for entry in all_corpora) if cid]

        # Find Kubhist corpora
        kubhist_corpora = [cid for cid in corpus_ids if "kubhist" in cid.lower()]
        for corpus_id in kubhist_corpora:
            print(f"  - {corpus_id} (KUBHIST CORPUS)")

        # Print all corpora for reference
        if corpus_ids:
            print("\nAll available corpora:")
            for corpus_id in sorted(corpus_ids):
                print(f"  - {corpus_id}")
        else:
            print("No corpora found in API response.")

        return kubhist_corpora
    except Exception as e:
        print(f"Error listing corpora: {e}")
        return []

def fetch_korp_data(corpus_id, start_index, count):
    """Request one window of query hits from the Korp ``/query`` endpoint.

    Args:
        corpus_id: ID of the corpus to query.
        start_index: Index of the first hit to return.
        count: Number of hits in the window; the last requested index is
            ``start_index + count - 1``.

    Returns:
        The decoded JSON payload, or ``None`` when the server answers with a
        non-200 status or when any exception occurs (the problem is printed).
    """
    last_index = start_index + count - 1
    query = dict(
        command="query",
        corpus=corpus_id,
        cqp="[]",                       # match every token
        start=start_index,
        end=last_index,
        defaultcontext="1 sentence",    # return the enclosing sentence as context
        show=["word"],
    )
    try:
        reply = requests.get(f"{KORP_API_URL}/query", params=query, timeout=60)
        if reply.status_code == 200:
            return reply.json()
        print(f"API returned status code {reply.status_code}")
        print("API Response:", reply.text)
        return None
    except Exception as e:
        print(f"Error: {e}")
        return None

def extract_sentences(korp_response):
    """Extracts full sentences from the Korp API response.

    Each entry of ``korp_response["kwic"]`` contributes one sentence, built by
    joining the ``"word"`` value of its tokens with single spaces.

    Duplicates are removed while keeping first-seen order, so the result is
    deterministic from run to run (a plain ``set`` gave arbitrary ordering).
    Sentences that are empty after stripping are dropped, and tokens whose
    ``"word"`` is missing or not a string are skipped.

    Args:
        korp_response: Decoded JSON dict from the Korp query endpoint, or a
            falsy value.

    Returns:
        list[str]: Unique, stripped sentences in response order. An empty list
        is returned when the response is falsy or has no ``"kwic"`` key; if it
        carries an ``"ERROR"`` key, that error is printed first.
    """
    if not korp_response or "kwic" not in korp_response:
        if korp_response and "ERROR" in korp_response:
            print(f"API Error: {korp_response['ERROR']}")
        return []

    # dict keys act as an insertion-ordered set
    unique_sentences = {}
    for hit in korp_response.get("kwic") or []:
        tokens = hit.get("tokens") or []
        sentence = " ".join(
            token["word"] for token in tokens if isinstance(token.get("word"), str)
        ).strip()
        if sentence:
            unique_sentences.setdefault(sentence, None)

    return list(unique_sentences)

def download_kubhist_data(corpus_id):
    """Downloads data from the specified corpus and saves it to a file.

    Repeatedly requests windows of ``CHUNK_SIZE`` hits through
    ``fetch_korp_data`` until ``TARGET_SENTENCES`` hits have been requested,
    the API reports an error or no hits, or the reported corpus size is
    reached. Sentences are de-duplicated across ALL batches (not only within
    one batch), so the "unique" totals that are printed and the file that is
    written really contain each sentence once, in first-seen order.

    The result is written, one sentence per line, to
    ``OUTPUT_DIR/<corpus_id lower-cased>_corpus.txt``.

    Args:
        corpus_id: Korp corpus ID to download from.
    """
    all_sentences = []
    seen_sentences = set()  # everything already collected, for cross-batch de-duplication
    total_fetched = 0

    print(f"Starting download from corpus: {corpus_id}")

    while total_fetched < TARGET_SENTENCES:
        print(f"Fetching sentences {total_fetched} to {total_fetched + CHUNK_SIZE - 1}...")
        data = fetch_korp_data(corpus_id, total_fetched, CHUNK_SIZE)

        if not data:
            print("Failed to fetch data. Stopping.")
            break

        if "ERROR" in data:
            print(f"API Error: {data['ERROR']}")
            break

        sentences = extract_sentences(data)

        if not sentences:
            if data.get("hits", 0) == 0:
                print("No more hits found in the corpus.")
                break
            else:
                print("Warning: Got API hits but failed to extract sentences.")
                # Print a sample hit for debugging
                if "kwic" in data and data["kwic"]:
                    print("Sample hit:", json.dumps(data["kwic"][0], indent=2))

        # Keep only sentences not collected by this or any earlier batch
        new_sentences = []
        for sentence in sentences:
            if sentence not in seen_sentences:
                seen_sentences.add(sentence)
                new_sentences.append(sentence)

        all_sentences.extend(new_sentences)
        total_fetched += CHUNK_SIZE

        print(f"  Added {len(new_sentences)} new unique sentences from this batch.")
        print(f"  Total unique sentences so far: {len(all_sentences)}")

        # Check if we've reached the end of available hits
        if "corpus_hits" in data:
            # Debug output to see the structure of corpus_hits
            corpus_hits = data["corpus_hits"]
            print(f"corpus_hits value: {corpus_hits}")

            # None means "size unknown": keep going instead of stopping early
            corpus_hits_count = None
            if isinstance(corpus_hits, int):
                corpus_hits_count = corpus_hits
            elif isinstance(corpus_hits, dict):
                try:
                    corpus_hits_count = sum(corpus_hits.values())
                except (TypeError, ValueError):
                    corpus_hits_count = None

            if corpus_hits_count is not None:
                print(f"Total corpus hits: {corpus_hits_count}")

                if total_fetched >= corpus_hits_count:
                    print(f"Reached the end of the corpus ({corpus_hits_count} total hits).")
                    break

        # Add a small delay to be nice to the API
        time.sleep(1)

    print(f"Download finished. Collected {len(all_sentences)} unique sentences.")

    # Save to file with the corpus ID in the filename
    output_file = os.path.join(OUTPUT_DIR, f"{corpus_id.lower()}_corpus.txt")
    if all_sentences:
        with open(output_file, "w", encoding="utf-8") as f:
            for sentence in all_sentences:
                f.write(sentence + "\n")

        print(f"Successfully saved data to {output_file}")

        # Show sample for verification
        print("\n--- Sample Sentences ---")
        for i, sentence in enumerate(all_sentences[:5]):
            print(f"{i+1}: {sentence}")
        print("------------------------")
    else:
        print("No sentences were collected.")

def _prompt_manual_download():
    """Ask for a corpus ID on stdin and download it unless the answer is empty or 'skip'."""
    manual_corpus = input("Enter a corpus ID manually (or 'skip' to exit): ").strip()
    if manual_corpus and manual_corpus.lower() != 'skip':
        download_kubhist_data(manual_corpus)
    else:
        print("Script execution completed.")


if __name__ == "__main__":
    # Interactive driver: list corpora, pick one whose ID starts with
    # CORPUS_PREFIX, and download it after confirmation. If the listing has no
    # such corpus, probe a fixed set of candidate IDs with one-hit queries.
    # The prefix is defined once so the filter and the messages cannot drift
    # apart (the filter used "KUBHIST2" while the messages said "KUBHIST1").
    CORPUS_PREFIX = "KUBHIST2"

    print("Step 1: Finding available corpora...")
    kubhist_corpora = list_available_corpora()

    # Keep only corpora whose ID starts with the chosen prefix
    preferred_corpora = [c for c in kubhist_corpora if c.upper().startswith(CORPUS_PREFIX)]
    kubhist1_corpora = preferred_corpora  # legacy name, kept in case other cells read it

    if preferred_corpora:
        print(f"\nFound the following {CORPUS_PREFIX} corpora:", preferred_corpora)
        corpus_to_use = preferred_corpora[0]  # Use the first matching corpus found
        print(f"Using corpus: {corpus_to_use}")

        # Prompt to continue with download
        choice = input("Continue with download? (y/n): ").strip().lower()
        if choice == 'y':
            download_kubhist_data(corpus_to_use)
        else:
            print("Download cancelled.")
    else:
        print(f"\nNo {CORPUS_PREFIX} corpora found in the API listing. Let's try common IDs directly:")

        # Try common KUBHIST1 corpus IDs, prioritizing those
        test_corpora = [
            "KUBHIST1", "kubhist1", "KUBHIST1-1750-1950", "kubhist1-1750-1950",
            "KUBHIST1_1750_1950", "kubhist1_1750_1950",
            "KUB-HIST-1", "kub-hist-1", "KUB_HIST_1", "kub_hist_1"
        ]

        # Also try some known working corpus IDs to validate the API connection
        test_corpora.extend(["kubhist", "KUBHIST", "suc3", "ROMI", "SUC"])

        print("\nTesting direct queries to the following corpus IDs:")
        for corpus in test_corpora:
            print(f"  - {corpus}")

        success_corpora = []

        # Test each corpus with a minimal query
        for corpus in test_corpora:
            print(f"\nTesting corpus: {corpus}")
            try:
                test_data = fetch_korp_data(corpus, 0, 1)  # Just get one result to test
                if test_data and "ERROR" not in test_data and test_data.get("hits", 0) > 0:
                    print(f"✓ SUCCESS: Corpus '{corpus}' is valid and returned results")
                    success_corpora.append(corpus)
                elif test_data and "ERROR" in test_data:
                    print(f"✗ ERROR: {test_data['ERROR']}")
                else:
                    print(f"✗ No hits returned for corpus '{corpus}'")
            except Exception as e:
                print(f"✗ Exception: {e}")

            # Small delay between requests
            time.sleep(0.5)

        if success_corpora:
            print("\nThe following corpus IDs are working:")
            for corpus in success_corpora:
                print(f"  - {corpus}")

            # Use the first working KUBHIST1 corpus or fallback to any working KUBHIST corpus
            kubhist1_working = [c for c in success_corpora if c.upper().startswith("KUBHIST1")]
            kubhist_working = [c for c in success_corpora if "kubhist" in c.lower()]

            if kubhist1_working:
                recommended = kubhist1_working[0]
                print(f"\nRecommended KUBHIST1 corpus: {recommended}")
            elif kubhist_working:
                recommended = kubhist_working[0]
                print(f"\nNo KUBHIST1 corpus found, but found KUBHIST corpus: {recommended}")
            else:
                recommended = success_corpora[0]
                print(f"\nNo KUBHIST corpus found. Using alternative corpus: {recommended}")

            choice = input("Continue with download using this corpus? (y/n): ").strip().lower()
            if choice == 'y':
                download_kubhist_data(recommended)
            else:
                _prompt_manual_download()
        else:
            print("\nNo working corpus IDs found!")
            _prompt_manual_download()

# Domain Adaptation

In [None]:
import os
import torch
import json
import logging
import random # Added for shuffling
from google.colab import drive, files # Added files for download explanation
from datasets import Dataset, DatasetDict
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

# Setup logging: INFO-level root configuration plus a module-level logger that
# the functions defined in this cell and the next one write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Mount Google Drive (Colab only; the corpus files are read from the mount point)
drive.mount('/content/drive')

# Define paths
# Data directory in Google Drive where the corpus files are located
DRIVE_DATA_DIR = "/content/drive/MyDrive/swedish_ner_data"
# Target files to merge; find_corpus_files() joins each name onto DRIVE_DATA_DIR
TARGET_CORPUS_FILES = [
    "kubhist-dalpilen-1900_corpus.txt",
    "kubhist2-aftonbladet-1830_corpus.txt"
]
# Where to save the final adapted model in the Colab environment.
# This is under /content, not under the Drive mount, so it has to be copied
# elsewhere to be kept.
COLAB_MODEL_SAVE_DIR = "/content/adapted_model_merged"

# Create save directory in Colab's local storage
os.makedirs(COLAB_MODEL_SAVE_DIR, exist_ok=True)

def find_corpus_files(target_files):
    """Resolve the given corpus file names inside ``DRIVE_DATA_DIR``.

    Args:
        target_files: Iterable of file names expected in ``DRIVE_DATA_DIR``.

    Returns:
        A list with the full path of every file, in input order, when all of
        them exist. ``None`` if at least one file is missing or if no file was
        found at all.
    """
    located = []
    absent = []
    for name in target_files:
        candidate = os.path.join(DRIVE_DATA_DIR, name)
        if not os.path.exists(candidate):
            logger.warning(f"Target corpus file not found: {candidate}")
            absent.append(name)
            continue
        logger.info(f"Found target corpus file: {name}")
        located.append(candidate)

    # All requested files are mandatory: one missing file fails the lookup
    if absent:
        logger.error(f"Missing required corpus files: {', '.join(absent)}")
        return None

    if len(located) == 0:
        logger.error("No corpus files found.")
        return None

    return located

def _collapse_blank_lines(lines):
    """Strip every line and collapse each run of blank lines into a single empty string."""
    result = []
    prev_empty = False
    for line in lines:
        stripped = line.strip()
        if stripped or not prev_empty:  # Keep text lines and the first blank line of a run
            result.append(stripped)
        prev_empty = not stripped
    return result


def load_corpus_data(file_paths, shuffle=True, seed=42):
    """Load text data from a list of corpus files, merge, and shuffle.

    Each file is read as UTF-8, every line is stripped, and consecutive blank
    lines are collapsed into one. The lines of all files are concatenated and
    then shuffled with a dedicated, seeded RNG. Without the shuffle, a later
    positional train/test split would take its test portion entirely from the
    end of the last file.

    Args:
        file_paths: Paths of the corpus text files to load.
        shuffle: Whether to shuffle the merged lines (default ``True``).
        seed: Seed of the private ``random.Random`` used for shuffling; the
            global ``random`` state is left untouched.

    Returns:
        list[str]: The merged lines, or an empty list if nothing could be
        loaded. Files that are missing or fail to load are logged and skipped.
    """
    all_texts = []
    total_loaded_count = 0

    for file_path in file_paths:
        if not os.path.exists(file_path):
            logger.warning(f"File not found during loading: {file_path}")
            continue

        logger.info(f"Loading data from {file_path}...")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                cleaned_texts = _collapse_blank_lines(f)
        except Exception as e:
            logger.error(f"Error loading {file_path}: {e}")
            continue

        all_texts.extend(cleaned_texts)
        logger.info(f"  Loaded {len(cleaned_texts)} sentences from {os.path.basename(file_path)}")
        total_loaded_count += len(cleaned_texts)

    logger.info(f"Total sentences loaded from all files: {total_loaded_count}")

    if not all_texts:
        logger.error("No text data loaded from any specified file.")
        return []

    if shuffle:
        # Fixed seed keeps the merged order (and any later split) reproducible
        random.Random(seed).shuffle(all_texts)

    return all_texts

def create_dataset(texts):
    """Split ``texts`` by position into train/test and wrap both in a ``DatasetDict``.

    The first 95% of the items (rounded down) form the ``'train'`` split and
    the remainder forms ``'test'``; both expose a single ``'text'`` column.

    Args:
        texts: Sequence of strings.

    Returns:
        DatasetDict with ``'train'`` and ``'test'`` entries.
    """
    n_total = len(texts)
    cut = int(0.95 * n_total)
    if cut == n_total:  # Ensure test set is not empty if dataset is small
        cut -= 1

    train_texts, test_texts = texts[:cut], texts[cut:]

    if not test_texts:  # Handle case with very few sentences
        logger.warning("Dataset too small for a separate test set. Using a small portion of train set for evaluation.")
        tail_len = max(1, int(0.01 * len(train_texts)))  # last 1% or at least 1 example
        test_texts = train_texts[-tail_len:]

    splits = {'train': train_texts, 'test': test_texts}
    dataset_dict = DatasetDict({
        split_name: Dataset.from_dict({'text': split_texts})
        for split_name, split_texts in splits.items()
    })

    logger.info(f"Created dataset with {len(train_texts)} training and {len(test_texts)} test examples")
    return dataset_dict

def tokenize_function(examples, tokenizer):
    """Tokenize text for MLM task.

    Sequences are truncated to 256 tokens but NOT padded here. Padding every
    example to 256 at tokenization time wastes compute on short sentences and
    makes length-based batching pointless; padding is left to the data
    collator, which pads each batch to its own longest sequence.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the strings.
        tokenizer: Callable tokenizer accepting the texts plus the keyword
            arguments used below.

    Returns:
        Whatever ``tokenizer`` returns for the batch (requested with the
        special-tokens mask so the collator can avoid masking special tokens).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,  # Keep reduced length for efficiency
        return_special_tokens_mask=True,
    )

def apply_fast_domain_adaptation(model, num_layers_to_train=2):
    """
    Restrict training to a small part of the model.

    All parameters are frozen first. Then the embeddings under
    ``model.roberta``, the first ``num_layers_to_train`` encoder layers and
    the MLM head (``model.lm_head`` if present, otherwise ``model.cls`` if
    present) are made trainable again. Parameter counts are logged.

    Args:
        model: Model exposing ``roberta.embeddings`` and ``roberta.encoder.layer``.
        num_layers_to_train: Number of leading encoder layers to unfreeze.

    Returns:
        The same ``model`` object, modified in place.
    """
    def _unfreeze(module):
        for weight in module.parameters():
            weight.requires_grad = True

    # Start from a fully frozen model
    for weight in model.parameters():
        weight.requires_grad = False

    total_params = sum(weight.numel() for weight in model.parameters())

    backbone = model.roberta

    # Embeddings (these are critical for domain adaptation)
    _unfreeze(backbone.embeddings)

    # First n transformer layers, capped at the number of available layers
    encoder_layers = backbone.encoder.layer
    n_unfrozen = min(num_layers_to_train, len(encoder_layers))
    for layer_idx in range(n_unfrozen):
        _unfreeze(encoder_layers[layer_idx])

    # MLM head, under whichever attribute name the model uses
    if hasattr(model, 'lm_head'):
        _unfreeze(model.lm_head)
    elif hasattr(model, 'cls'):
        _unfreeze(model.cls)

    trainable_params = sum(
        weight.numel() for weight in model.parameters() if weight.requires_grad
    )
    frozen_params = total_params - trainable_params

    logger.info(f"Model has {total_params:,} total parameters")
    logger.info(f"Training {trainable_params:,} parameters ({trainable_params/total_params:.1%})")
    logger.info(f"Freezing {frozen_params:,} parameters ({frozen_params/total_params:.1%})")

    return model

Mounted at /content/drive


In [None]:
def install_accelerate():
    """Install required dependencies for training.

    Checks whether ``accelerate`` is importable and, if not, installs it with
    pip. pip is invoked as ``<current interpreter> -m pip`` so the package is
    installed into the environment the kernel is running in; a bare ``pip``
    from PATH may belong to a different Python installation.

    Failures are logged rather than raised.
    """
    import importlib.util
    import subprocess
    import sys

    try:
        # Check if accelerate is installed
        if importlib.util.find_spec("accelerate") is None:
            logger.info("Installing accelerate...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "accelerate"])
            logger.info("Accelerate installed successfully")
    except Exception as e:
        logger.error(f"Failed to install dependencies: {e}")

def main():
    """Run MLM domain adaptation of XLM-RoBERTa on the merged corpus files.

    Steps: locate and load the corpus files, build a train/test dataset, load
    ``FacebookAI/xlm-roberta-large``, freeze everything except the embeddings,
    the first encoder layers and the MLM head, tokenize, train with the
    Hugging Face ``Trainer``, save model and tokenizer to
    ``COLAB_MODEL_SAVE_DIR``, evaluate on the test split, and write a README
    with the run settings. Returns early (after logging an error) if the
    corpus files, the text data or the model weights cannot be loaded.

    Mixed precision and the fused AdamW optimizer are only enabled when a CUDA
    device is available, so the function also runs on a CPU-only machine
    instead of failing when the trainer is configured.
    """
    logger.info("Starting domain adaptation from scratch on merged dataset...")

    # Single definition of the number of unfrozen encoder layers, used for the
    # model setup, the log output and the README alike.
    num_layers_to_train = 2

    # Install Accelerate if needed
    install_accelerate()

    # Find corpus files
    corpus_files = find_corpus_files(TARGET_CORPUS_FILES)
    if not corpus_files:
        logger.error("Required corpus files not found. Exiting.")
        return

    # Load and merge corpus data
    logger.info("Loading and merging corpus data...")
    texts = load_corpus_data(corpus_files)

    if not texts:
        logger.error("No text data loaded. Exiting.")
        return

    # Log a few examples from the merged data
    logger.info("Sample sentences from merged corpus:")
    for i, text in enumerate(texts[:3]):
        logger.info(f"  {i+1}: {text[:100]}...")

    # Create dataset
    dataset = create_dataset(texts)

    # Load BASE model and tokenizer
    logger.info("Loading BASE XLM-RoBERTa model and tokenizer...")
    model_name = "FacebookAI/xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    try:
        model = XLMRobertaForMaskedLM.from_pretrained(
            model_name
        )
        # No attention implementation is requested above, so the message must
        # not claim that Flash Attention 2 is enabled.
        logger.info("Loaded base PyTorch weights successfully.")
    except OSError as e_base:
        logger.warning(f"Failed to load base PyTorch weights: {e_base}. Attempting TF...")
        try:
            model = XLMRobertaForMaskedLM.from_pretrained(model_name, from_tf=True)
            logger.info("Loaded base TensorFlow weights and converted.")
        except Exception as e_tf:
            logger.error(f"Failed to load model weights from both PyTorch and TF: {e_tf}")
            return  # Cannot proceed without a model

    # Apply fast domain adaptation (freeze most layers)
    logger.info("Applying fast domain adaptation strategy (freezing most layers)...")
    model = apply_fast_domain_adaptation(model, num_layers_to_train=num_layers_to_train)

    # Gradient Checkpointing: Keep enabled for now, can disable later if RAM allows
    model.gradient_checkpointing_enable()

    # Tokenize dataset (results are reused from the datasets cache when available)
    logger.info("Tokenizing dataset...")
    tokenized_datasets = dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        batch_size=1000,
        remove_columns=["text"],
        num_proc=4,
        desc="Tokenizing dataset",
        load_from_cache_file=True
    )

    # Create data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
    )

    # Check GPU
    use_cuda = torch.cuda.is_available()
    device_info = "GPU" if use_cuda else "CPU"
    logger.info(f"Training will run on {device_info}")
    if use_cuda:
        logger.info(f"GPU Model: {torch.cuda.get_device_name(0)}")

    # Set up training arguments (fresh run, no checkpoint is resumed)
    batch_size = 32
    gradient_accumulation = 4
    num_epochs = 5

    training_args = TrainingArguments(
        output_dir=COLAB_MODEL_SAVE_DIR,
        overwrite_output_dir=True,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation,
        evaluation_strategy="no",
        save_strategy="epoch",  # Save at the end of each epoch
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        logging_dir=os.path.join(COLAB_MODEL_SAVE_DIR, "logs"),
        logging_steps=50,
        fp16=use_cuda,  # fp16 mixed precision needs a CUDA device
        dataloader_num_workers=4,
        dataloader_pin_memory=use_cuda,
        group_by_length=True,
        push_to_hub=False,
        report_to="none",
        # The fused AdamW kernel is CUDA-only; use the regular one on CPU
        optim="adamw_torch_fused" if use_cuda else "adamw_torch",
        ddp_find_unused_parameters=False,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )

    # Train the model
    logger.info("Starting domain adaptation training from scratch...")
    logger.info(f"Training on merged data from: {', '.join(os.path.basename(f) for f in corpus_files)}")
    logger.info(f"Training only embeddings and first {num_layers_to_train} layers")
    logger.info(f"Per-device batch size: {batch_size}, Grad Acc Steps: {gradient_accumulation}")
    logger.info(f"Effective batch size: {batch_size * gradient_accumulation}")
    # The resolved attention backend is kept on the config as `_attn_implementation`;
    # fall back to None when the installed transformers version lacks it.
    attn_implementation = getattr(model.config, "_attn_implementation", None)
    logger.info(f"Using Flash Attention 2: {attn_implementation == 'flash_attention_2'}")

    trainer.train()  # No resume_from_checkpoint

    # --- Explicitly save the final model and tokenizer ---
    logger.info(f"Training complete. Saving final model state to {COLAB_MODEL_SAVE_DIR}")
    trainer.save_model(COLAB_MODEL_SAVE_DIR)
    tokenizer.save_pretrained(COLAB_MODEL_SAVE_DIR)
    logger.info(f"Final model and tokenizer saved successfully.")

    # Evaluate the final model AFTER saving
    logger.info("Evaluating final model...")
    eval_results = trainer.evaluate()
    logger.info(f"Final evaluation results: {eval_results}")

    # Save a README
    readme_path = os.path.join(COLAB_MODEL_SAVE_DIR, "README.md")
    merged_filenames = ', '.join(os.path.basename(f) for f in corpus_files)
    precision_label = "FP16" if use_cuda else "FP32"
    try:
        # Explicit encoding: the file must not depend on the platform default
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(f"# Domain-Adapted XLM-RoBERTa (Merged Data)\n\n")
            f.write(f"Model adapted from `FacebookAI/xlm-roberta-large`.\n\n")
            f.write(f"## Training Information\n\n")
            f.write(f"- Base model: {model_name}\n")
            f.write(f"- Training data: Merged from {merged_filenames}\n")
            f.write(f"- Total sentences: {len(texts)}\n")
            f.write(f"- Training epochs: {training_args.num_train_epochs}\n")
            f.write(f"- Batch size: {batch_size} x {gradient_accumulation} = {batch_size * gradient_accumulation} effective\n")
            f.write(f"- Learning rate: {training_args.learning_rate}\n")
            f.write(f"- Sequence length: 256\n")
            f.write(f"- Fast adaptation: Trained embeddings and first {num_layers_to_train} layers\n")
            f.write(f"- Optimizations: Gradient checkpointing, {precision_label}\n")
            f.write(f"## Final Evaluation Results\n\n")
            f.write(f"(Evaluation performed after final model save)\n")  # Clarify when eval happened
            f.write(f"```json\n{json.dumps(eval_results, indent=2)}\n```\n")
    except Exception as e:
        logger.error(f"Failed to write README.md: {e}")

    logger.info("Domain adaptation from scratch complete!")
    logger.info(f"Model saved in Colab at: {COLAB_MODEL_SAVE_DIR}")

    # --- Download Instructions ---
    print("\n--- TO DOWNLOAD THE MODEL ---")
    print(f"The trained model files are saved in the Colab environment at: {COLAB_MODEL_SAVE_DIR}")
    print("Option 1: Using Colab's File Browser")
    print("  1. Click the 'Files' icon (folder symbol) on the left sidebar in Colab.")
    print(f"  2. Navigate to the '{os.path.basename(COLAB_MODEL_SAVE_DIR)}' directory.")
    print("  3. Right-click the directory and select 'Download'. This will zip the folder and download it.")
    print("Option 2: Zipping and downloading manually (run this code in a new cell):")
    print("```python")
    print(f"import shutil")
    print(f"shutil.make_archive('/content/adapted_model_archive', 'zip', '{COLAB_MODEL_SAVE_DIR}')")
    print(f"from google.colab import files")
    print(f"files.download('/content/adapted_model_archive.zip')")
    print("```")
    print(f"After downloading, unzip the file into your target directory: /Users/danielyebra/Downloads")
    print("-----------------------------")


if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at FacebookAI/xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenizing dataset (num_proc=4):   0%|          | 0/189920 [00:00<?, ? examples/s]

Tokenizing dataset (num_proc=4):   0%|          | 0/9996 [00:00<?, ? examples/s]



Step,Training Loss
50,4.1631
100,4.074
150,4.0123
200,3.9816
250,3.8903
300,3.8249
350,3.8192
400,3.7963
450,3.7632
500,3.6894


Step,Training Loss
50,4.1631
100,4.074
150,4.0123
200,3.9816
250,3.8903
300,3.8249
350,3.8192
400,3.7963
450,3.7632
500,3.6894



--- TO DOWNLOAD THE MODEL ---
The trained model files are saved in the Colab environment at: /content/adapted_model_merged
Option 1: Using Colab's File Browser
  1. Click the 'Files' icon (folder symbol) on the left sidebar in Colab.
  2. Navigate to the 'adapted_model_merged' directory.
  3. Right-click the directory and select 'Download'. This will zip the folder and download it.
Option 2: Zipping and downloading manually (run this code in a new cell):
```python
import shutil
shutil.make_archive('/content/adapted_model_archive', 'zip', '/content/adapted_model_merged')
from google.colab import files
files.download('/content/adapted_model_archive.zip')
```
After downloading, unzip the file into your target directory: /Users/danielyebra/Downloads
-----------------------------


In [None]:
import shutil
import os

# Define source and destination paths
# NOTE(review): the checkpoint step number (7415) is hard-coded, so this cell
# only works for a run that produced exactly that checkpoint directory —
# confirm the number after each training run.
source_path = "/content/adapted_model_merged/checkpoint-7415"
# Destination is under the Google Drive mount point used earlier in the notebook
destination_path = "/content/drive/MyDrive/swedish_historical_ner_results/adapted_model"

# Ensure destination directory exists
os.makedirs(destination_path, exist_ok=True)

# Copy the checkpoint to the destination; dirs_exist_ok=True lets the copy
# proceed even though the destination directory was just created above
shutil.copytree(source_path, destination_path, dirs_exist_ok=True)

print(f"Checkpoint copied from {source_path} to {destination_path}")

Checkpoint copied from /content/adapted_model_merged/checkpoint-7415 to /content/drive/MyDrive/swedish_historical_ner_results/adapted_model
