In [None]:
!pip install -q transformers datasets seqeval accelerate

from google.colab import files
import gdown
import zipfile
import os

# Google Drive id of the labelled-data archive (placeholder value; set before running).
file_id = 'file_id'
url = f'https://drive.google.com/uc?id={file_id}'

# The downloaded file is opened as a zip archive below even though it carries a
# .conll name. One absolute path is used for both the download and the
# extraction so the cell does not depend on the current working directory.
zip_path = '/content/labeled_data.conll'
extract_dir = '/content/extracted_data'

# Download the file
output = zip_path
downloaded = gdown.download(url, output, quiet=False)
if downloaded is None or not os.path.exists(zip_path):
    raise RuntimeError(f"Download failed for {url}; check file_id and sharing permissions.")

# A permission/quota error page can be saved in place of the archive; fail
# with an explicit message instead of an opaque error from extractall().
if not zipfile.is_zipfile(zip_path):
    raise zipfile.BadZipFile(f"{zip_path} is not a zip archive (downloaded from {url}).")

# Make sure the directory exists
os.makedirs(extract_dir, exist_ok=True)

# Extract all files
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extracted files:", os.listdir(extract_dir))

# Path of the CoNLL file expected inside the archive; read by the loading step.
labeled_data = os.path.join(extract_dir, 'labeled_data.conll')



In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report
import os

# 1. Load CoNLL data
def load_conll_file(path):
    """Read a whitespace-separated CoNLL file into a Hugging Face ``Dataset``.

    Each non-blank line contributes one token: the first column is the token
    text and the last column is its tag. Blank lines separate sentences.
    Lines with fewer than two columns are skipped.

    Args:
        path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A ``(data, label2id, id2label)`` tuple where ``data`` has the columns
        ``"tokens"`` (list of str per sentence) and ``"ner_tags"`` (list of
        int ids per sentence), ``label2id`` maps each tag string to its id
        (ids follow the sorted order of the tags), and ``id2label`` is the
        inverse mapping.
    """
    sentences, labels = [], []
    current_tokens, current_tags = [], []
    all_tags = set()

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line closes the sentence being accumulated (if any).
                if current_tokens:
                    sentences.append(current_tokens)
                    labels.append(current_tags)
                    current_tokens, current_tags = [], []
                continue

            splits = line.split()
            if len(splits) < 2:
                # No tag column on this line: nothing usable to record.
                continue

            token, tag = splits[0], splits[-1]
            current_tokens.append(token)
            current_tags.append(tag)
            all_tags.add(tag)

    # A file that does not end with a blank line still has a pending sentence;
    # flush it so the final sentence is not lost.
    if current_tokens:
        sentences.append(current_tokens)
        labels.append(current_tags)

    # Build vocab (sorted so ids are stable across runs)
    tag_list = sorted(all_tags)
    label2id = {label: i for i, label in enumerate(tag_list)}
    id2label = {i: label for label, i in label2id.items()}

    # Convert string tags to ids
    label_list = [[label2id[tag] for tag in seq] for seq in labels]

    # Create Dataset
    data = Dataset.from_dict({"tokens": sentences, "ner_tags": label_list})
    return data, label2id, id2label




In [None]:

# 2. Split & encode labels

model_name = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Parse the extracted CoNLL file so the dataset and both label mappings exist
# at notebook scope on a fresh kernel; nothing else in the notebook creates them.
raw_dataset, label2id, id2label = load_conll_file(labeled_data)

# Tag strings ordered by id; the model-loading step sizes its head from this.
label_list = [id2label[i] for i in range(len(id2label))]

# Split from `raw_dataset` rather than reassigning `dataset` onto itself, so
# re-running this cell always starts from the unsplit data.
dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)


# 3. Tokenize

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand word tags to token tags.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word tag-id lists).

    Returns:
        The tokenizer output for the batch (padded/truncated to 128 tokens)
        with an added ``"labels"`` entry: one list per sentence in which every
        token carries the tag id of the word it came from, and positions whose
        word id is ``None`` carry ``-100``.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    def align(word_ids, word_tags):
        # Every sub-token (first or continuation) inherits its word's tag;
        # positions with no source word get the -100 sentinel.
        return [-100 if wid is None else word_tags[wid] for wid in word_ids]

    encoding["labels"] = [
        align(encoding.word_ids(batch_index=row), word_tags)
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding


In [None]:

# 4. Load model
# The classification head is sized from the id->label mapping that is passed
# to the model anyway, so the label count and the mappings cannot disagree.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# 5. Setup trainer
# Hyper-parameters are collected in one mapping so they can be inspected or
# tweaked before the TrainingArguments object is built.
# NOTE(review): output_dir points under /content/drive — assumes Google Drive
# is already mounted; no mount call is visible in this notebook.
training_config = dict(
    output_dir="/content/drive/MyDrive/10Academy/NER_Models/Tiny-BERT-ner",
    run_name="Tiny-BERT-ner-run1",
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,  # good on T4
    # Checkpointing and logging
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)
args = TrainingArguments(**training_config)

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    """Compute token-level accuracy and weighted F1 for an evaluation pass.

    Args:
        p: ``(predictions, label_ids)`` pair. ``predictions`` holds per-token
            class scores (the argmax over the last axis is taken) and
            ``label_ids`` holds gold tag ids, with ``-100`` marking positions
            to ignore.

    Returns:
        Dict with ``"accuracy"`` (fraction of non-ignored tokens whose
        predicted id equals the gold id; 0.0 when there are none) and
        ``"f1"`` (the "weighted avg" F1 from seqeval's classification report).
    """
    scores, labels = p
    pred_ids = scores.argmax(-1)

    true_labels, true_preds = [], []
    correct = 0
    total = 0
    for pred_row, label_row in zip(pred_ids, labels):
        gold_tags, pred_tags = [], []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id == -100:
                continue
            gold_tags.append(id2label[int(label_id)])
            pred_tags.append(id2label[int(pred_id)])
            correct += int(pred_id == label_id)
            total += 1
        true_labels.append(gold_tags)
        true_preds.append(pred_tags)

    # Avoid crashing on missing metrics
    report = classification_report(true_labels, true_preds, output_dict=True, zero_division=0)

    # seqeval's report dict has per-entity rows and micro/macro/weighted
    # averages but no "accuracy" entry, so accuracy is counted directly here.
    accuracy = correct / total if total else 0.0

    return {
        "accuracy": float(accuracy),
        "f1": report["weighted avg"]["f1-score"],
    }

# Tokenize both splits and attach the aligned labels. tokenize_and_align_labels
# iterates over a batch of sentences, so it must be mapped with batched=True.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



In [None]:
# Train the model
train_result = trainer.train()

# The TrainingArguments in this notebook set no evaluation strategy, so the
# held-out split and compute_metrics are never used during training.
# Evaluate once explicitly so the accuracy / F1 numbers are actually produced.
eval_metrics = trainer.evaluate()
eval_metrics



In [None]:
# Local copy on the Colab VM disk (lost when the runtime is recycled).
trainer.save_model("/content/ethioner-tiny-bert-amh")
tokenizer.save_pretrained("/content/ethioner-tiny-bert-amh")

# Persistent copy on Google Drive.
save_path = "/content/drive/MyDrive/10Academy/NER_Models/Tiny-BERT-ner"

# No Drive mount call is visible in this notebook. If Drive is not mounted,
# the directories would be created on the VM disk and the "persistent" copy
# would disappear with the runtime, so warn instead of failing silently.
if not os.path.isdir("/content/drive/MyDrive"):
    print(
        "WARNING: /content/drive/MyDrive not found; Google Drive does not appear "
        f"to be mounted, so files written to {save_path} will not persist."
    )

# Save model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)