In [1]:
!pip install transformers torch scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
from torch.nn import functional as F
from transformers import pipeline

In [3]:
import pandas as pd
import numpy as np

In [4]:
from torch import nn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score

In [5]:
train_df = pd.read_csv('train.tsv', delimiter='\t', header=None, names=['Word', 'Tag'])
test_df = pd.read_csv('test.tsv', delimiter='\t', header=None, names=['Word', 'Tag'])
valid_df = pd.read_csv('valid.tsv', delimiter='\t', header=None, names=['Word', 'Tag'])


In [6]:
train_df

Unnamed: 0,Word,Tag
0,Tanzania,B-LOC
1,fi,O
2,Ajìjàgbara,O
3,Ọmọ,O
4,Orílẹ̀-èdèe,O
...,...,...
20232,Keosineam,O
20233,jẹ́,O
20234,asọ̀tàn,O
20235,.,O


In [7]:
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
valid_df.dropna(inplace=True)

In [8]:
words = train_df["Word"].tolist()
tags = train_df["Tag"].tolist()

In [9]:
len(tags)

19421

In [10]:
def create_label_map(df):
    unique_labels = df['Tag'].unique()
    label2id = {label: i for i, label in enumerate(sorted(unique_labels))}
    id2label = {i: label for label, i in label2id.items()}
    return label2id, id2label

In [11]:
label2id, id2label = create_label_map(train_df)

In [12]:
ner_tokenizer = AutoTokenizer.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
ner_model = AutoModelForTokenClassification.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/404 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [13]:
class YorubaNERDataset(Dataset):
    def __init__(self, dataframe, tokenizer, label2id, max_length=128):
        self.data = self.prepare_data(dataframe)
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def prepare_data(self, df):
        # Group by implicit sentence breaks (NaN or empty rows)
        grouped = df.groupby((df['Word'].isna() | df['Word'].eq('')).cumsum())
        return [group[['Word', 'Tag']].dropna().values.tolist() for _, group in grouped]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = [word for word, _ in self.data[idx]]
        labels = [self.label2id[label] for _, label in self.data[idx]]

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_length)

        word_ids = encoding.word_ids()
        label_ids = [-100] * len(word_ids)

        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                label_ids[idx] = labels[word_id]

        return {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(label_ids)
        }

In [14]:
train_dataset = YorubaNERDataset(train_df, ner_tokenizer, label2id)
valid_dataset = YorubaNERDataset(valid_df, ner_tokenizer, label2id)
test_dataset = YorubaNERDataset(test_df, ner_tokenizer, label2id)

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNNForNER(nn.Module):
    def __init__(self, pretrained_model, num_classes, max_length=128):
        super(CNNForNER, self).__init__()
        self.transformer = pretrained_model
        self.max_length = max_length

        # Get the number of labels from the pretrained model
        pretrained_num_labels = self.transformer.num_labels

        self.conv1 = nn.Conv1d(in_channels=pretrained_num_labels, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, sequence_length, pretrained_num_labels)

        # Apply CNN layers
        logits = logits.permute(0, 2, 1)  # Shape: (batch_size, pretrained_num_labels, sequence_length)
        conv1_out = F.relu(self.conv1(logits))
        conv2_out = F.relu(self.conv2(conv1_out))
        conv2_out = self.dropout(conv2_out)
        conv2_out = conv2_out.permute(0, 2, 1)  # Shape: (batch_size, sequence_length, 128)
        final_logits = self.fc(conv2_out)  # Shape: (batch_size, sequence_length, num_classes)
        return final_logits

In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [17]:
num_classes = len(label2id)
model = CNNForNER(ner_model, num_classes)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)

In [19]:
num_epochs = 5

In [20]:
def train_and_validate(model, train_dataloader, valid_dataloader, optimizer, loss_fn, num_epochs, device, patience=3):
    best_valid_loss = float('inf')
    best_model = None
    patience_counter = 0

    for epoch in range(num_epochs):
        # Training loop
        model.train()
        total_train_loss = 0
        train_predictions = []
        train_labels = []

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            predictions = torch.argmax(outputs, dim=2)
            train_predictions.extend(predictions[labels != -100].cpu().numpy())
            train_labels.extend(labels[labels != -100].cpu().numpy())

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_f1 = f1_score(train_labels, train_predictions, average='weighted')

        # Validation loop
        model.eval()
        total_valid_loss = 0
        valid_predictions = []
        valid_labels = []

        with torch.no_grad():
            for batch in tqdm(valid_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
                total_valid_loss += loss.item()

                predictions = torch.argmax(outputs, dim=2)
                valid_predictions.extend(predictions[labels != -100].cpu().numpy())
                valid_labels.extend(labels[labels != -100].cpu().numpy())

        avg_valid_loss = total_valid_loss / len(valid_dataloader)
        valid_f1 = f1_score(valid_labels, valid_predictions, average='weighted')

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f}, Train F1: {train_f1:.4f}")
        print(f"Valid Loss: {avg_valid_loss:.4f}, Valid F1: {valid_f1:.4f}")

        # Check for improvement
        if avg_valid_loss < best_valid_loss:
            best_valid_loss = avg_valid_loss
            best_model = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered after epoch {epoch+1}")
            break

    return best_model

In [21]:
num_epochs = 10
best_model_state = train_and_validate(model, train_dataloader, valid_dataloader, optimizer, loss_fn, num_epochs, device)


Epoch 1/10 - Training: 100%|██████████| 1/1 [00:02<00:00,  2.53s/it]
Epoch 1/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 14.68it/s]


Epoch 1/10
Train Loss: 2.7072, Train F1: 0.0119
Valid Loss: 2.6217, Valid F1: 0.0085


Epoch 2/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
Epoch 2/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 14.93it/s]


Epoch 2/10
Train Loss: 2.3816, Train F1: 0.0968
Valid Loss: 2.4503, Valid F1: 0.0085


Epoch 3/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Epoch 3/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 15.23it/s]


Epoch 3/10
Train Loss: 2.2165, Train F1: 0.1101
Valid Loss: 2.2969, Valid F1: 0.0084


Epoch 4/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
Epoch 4/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 15.84it/s]


Epoch 4/10
Train Loss: 2.1409, Train F1: 0.1297
Valid Loss: 2.1622, Valid F1: 0.0084


Epoch 5/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
Epoch 5/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 24.61it/s]


Epoch 5/10
Train Loss: 2.0051, Train F1: 0.2590
Valid Loss: 2.0422, Valid F1: 0.0242


Epoch 6/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
Epoch 6/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 23.21it/s]


Epoch 6/10
Train Loss: 1.9415, Train F1: 0.3680
Valid Loss: 1.9371, Valid F1: 0.0242


Epoch 7/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.58it/s]
Epoch 7/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 25.65it/s]


Epoch 7/10
Train Loss: 1.8460, Train F1: 0.5119
Valid Loss: 1.8490, Valid F1: 0.1136


Epoch 8/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.78it/s]
Epoch 8/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 25.60it/s]


Epoch 8/10
Train Loss: 1.8063, Train F1: 0.5897
Valid Loss: 1.7690, Valid F1: 0.7645


Epoch 9/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
Epoch 9/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 25.36it/s]


Epoch 9/10
Train Loss: 1.7349, Train F1: 0.7009
Valid Loss: 1.6963, Valid F1: 0.9487


Epoch 10/10 - Training: 100%|██████████| 1/1 [00:00<00:00,  2.79it/s]
Epoch 10/10 - Validation: 100%|██████████| 1/1 [00:00<00:00, 24.05it/s]

Epoch 10/10
Train Loss: 1.6864, Train F1: 0.7315
Valid Loss: 1.6300, Valid F1: 0.9732





In [22]:
# Evaluation on test set
model.eval()
test_loss = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs.view(-1, num_classes), labels.view(-1))
        test_loss += loss.item()

        predictions = torch.argmax(outputs, dim=2)
        all_predictions.extend(predictions[labels != -100].cpu().numpy())
        all_labels.extend(labels[labels != -100].cpu().numpy())

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss:.4f}")

Test Loss: 1.6324


In [23]:
# Load the best model
model.load_state_dict(best_model_state)

<All keys matched successfully>

In [24]:
torch.save(model.state_dict(), 'ner_model.pth')