In [None]:
! pip install sentencepiece



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load data
def load_data(file_path):
    data = pd.read_csv(file_path, delimiter='\t', header=None, names=['sentence', 'label'])
    return data


In [None]:
train_data = load_data('./train.tsv')
valid_data = load_data('./valid.tsv')
test_data = load_data('./test.tsv')


In [None]:
train_data.head()

Unnamed: 0,sentence,label
0,Tanzania,B-LOC
1,fi,O
2,Ajìjàgbara,O
3,Ọmọ,O
4,Orílẹ̀-èdèe,O


In [None]:
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [None]:
train_data

Unnamed: 0,sentence,label
0,Tanzania,B-LOC
1,fi,O
2,Ajìjàgbara,O
3,Ọmọ,O
4,Orílẹ̀-èdèe,O
...,...,...
20231,Khoeuk,O
20232,Keosineam,O
20233,jẹ́,O
20234,asọ̀tàn,O


In [None]:
# Path to your files
words_file_path = '/content/yor_wikipedia_2021_10K-words.txt'
sentences_file_path = '/content/yor_wikipedia_2021_10K-sentences.txt'



In [None]:
# Read and combine the contents
with open(words_file_path, 'r', encoding='utf-8') as words_file, \
     open(sentences_file_path, 'r', encoding='utf-8') as sentences_file, \
     open('combined_training_data.txt', 'w', encoding='utf-8') as combined_file:

    # Write words to the combined file, each on a new line
    for word in words_file:
        combined_file.write(word.strip() + '\n')

    # Write sentences to the combined file, each on a new line
    for sentence in sentences_file:
        combined_file.write(sentence.strip() + '\n')

print("Combined file created successfully.")


Combined file created successfully.


In [None]:
import sentencepiece as spm


In [None]:
# Train SentencePiece using the combined data file
spm.SentencePieceTrainer.train('--input=combined_training_data.txt --model_prefix=spm_yoruba --vocab_size=32000 --character_coverage=0.9995 --model_type=bpe')

In [None]:
# Load the trained model
sp = spm.SentencePieceProcessor()
sp.load('spm_yoruba.model')

True

In [None]:
def encode_sentences(sentence_list, sp_model):
    return [sp_model.encode_as_ids(sentence) for sentence in sentence_list]


In [None]:
train_encoded = encode_sentences(train_data['sentence'].astype(str).tolist(), sp)
valid_encoded = encode_sentences(valid_data['sentence'].astype(str).tolist(), sp)
test_encoded = encode_sentences(test_data['sentence'].astype(str).tolist(), sp)


In [None]:
def pad_sequences(encoded_sentences, max_len):
    padded_sequences = torch.zeros((len(encoded_sentences), max_len), dtype=torch.long)
    for i, seq in enumerate(encoded_sentences):
        length = min(max_len, len(seq))
        padded_sequences[i, :length] = torch.tensor(seq[:length], dtype=torch.long)
    return padded_sequences


In [None]:
max_len = 128  # or any other appropriate length
train_padded = pad_sequences(train_encoded, max_len)
valid_padded = pad_sequences(valid_encoded, max_len)
test_padded = pad_sequences(test_encoded, max_len)

In [None]:
!pip install scikit-learn

import torch
from sklearn.preprocessing import LabelEncoder

# Initialize a label encoder
label_encoder = LabelEncoder()

# Fit the encoder on all unique labels from all datasets
all_labels = train_data['label'].tolist() + valid_data['label'].tolist() + test_data['label'].tolist()
label_encoder.fit(all_labels)





In [None]:
# Transform labels to numerical representations
train_labels_encoded = label_encoder.transform(train_data['label'].tolist())
valid_labels_encoded = label_encoder.transform(valid_data['label'].tolist())
test_labels_encoded = label_encoder.transform(test_data['label'].tolist())


In [None]:
# Now create the tensors
train_labels = torch.tensor(train_labels_encoded, dtype=torch.long)
valid_labels = torch.tensor(valid_labels_encoded, dtype=torch.long)
test_labels = torch.tensor(test_labels_encoded, dtype=torch.long)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Create TensorDatasets
train_dataset = TensorDataset(train_padded, train_labels)
valid_dataset = TensorDataset(valid_padded, valid_labels)
test_dataset = TensorDataset(test_padded, test_labels)

In [None]:

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class YorubaCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(YorubaCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, 128, 5, padding=1)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(128, 64, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(0.5)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(64 * (embed_dim), 128)
        self.fc2 = nn.Linear(128, num_classes)  # Adjust fc layer


    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # Permute for Conv1d: batch_size x embed_dim x seq_len
        x = self.conv1(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.dropout(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.dropout(x)
        x = self.relu3(x)
        x = self.pool3(x)

        # Dynamically calculate input size for the fully connected layer
        x = torch.flatten(x, 1)  # Flatten to prepare for the fully connected layer
        in_features = x.shape[1]
        self.fc = nn.Linear(in_features, self.fc.out_features)  # Adjust fc layer
        self.fc.to(x.device)  # Ensure fc is on the same device as input

        x = self.fc(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [None]:
# Model initialization
vocab_size = sp.get_piece_size()  # Number of tokens in the SentencePiece model
embed_dim = 256
num_classes = len(train_data['label'].unique())
model = YorubaCNN(vocab_size, embed_dim, num_classes)


In [None]:
from torch.optim import Adam

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for sentence, labels in data_loader:
        sentence, labels = sentence.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(sentence)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for sentence, labels in data_loader:
            sentence, labels = sentence.to(device), labels.to(device)
            outputs = model(sentence)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
    return total_loss / len(data_loader)


In [None]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


YorubaCNN(
  (embedding): Embedding(32000, 256)
  (conv1): Conv1d(256, 128, kernel_size=(5,), stride=(1,), padding=(1,))
  (dropout): Dropout(p=0.5, inplace=False)
  (relu): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu3): ReLU()
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=16384, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)

In [29]:
# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    valid_loss = evaluate(model, valid_loader, criterion, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Valid Loss: {valid_loss}')


Epoch 1, Train Loss: 0.5277088544817865, Valid Loss: 1.2072282370399026
Epoch 2, Train Loss: 0.4822376758624267, Valid Loss: 1.3318537698072546
Epoch 3, Train Loss: 0.47836534140395276, Valid Loss: 1.2532376429613898
Epoch 4, Train Loss: 0.47027628493834367, Valid Loss: 1.1996828661245458
Epoch 5, Train Loss: 0.4696219570419246, Valid Loss: 1.134067963852602
Epoch 6, Train Loss: 0.469300275346298, Valid Loss: 1.0918279563679414
Epoch 7, Train Loss: 0.46412338415863486, Valid Loss: 1.0611516742145315
Epoch 8, Train Loss: 0.4673239453718729, Valid Loss: 0.994757055535036
Epoch 9, Train Loss: 0.4603569605947768, Valid Loss: 0.8122517869752996
Epoch 10, Train Loss: 0.40535599547492024, Valid Loss: 0.4578837875057669
Epoch 11, Train Loss: 0.31253232311877793, Valid Loss: 0.42954842904034785
Epoch 12, Train Loss: 0.26297382569965433, Valid Loss: 0.379318871568231
Epoch 13, Train Loss: 0.24279396561752312, Valid Loss: 0.4132897145607892
Epoch 14, Train Loss: 0.23820341507696333, Valid Loss: 0

In [30]:
model.eval()

YorubaCNN(
  (embedding): Embedding(32000, 256)
  (conv1): Conv1d(256, 128, kernel_size=(5,), stride=(1,), padding=(1,))
  (dropout): Dropout(p=0.5, inplace=False)
  (relu): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu3): ReLU()
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=480, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)

In [31]:
# Assuming your model is called 'model'
model_save_path = './yoruba_ner_model.pth'

# Save the entire model
torch.save(model, model_save_path)

# # Alternatively, save only the state dict (recommended for flexibility)
# torch.save(model.state_dict(), model_save_path)

OrderedDict([('embedding.weight',
              tensor([[ 5.9940e-01,  4.4266e-03, -4.5102e-02,  ...,  9.8259e-03,
                       -7.9259e-03, -4.0927e-02],
                      [ 2.2592e-38,  5.7848e-38, -6.3370e-38,  ...,  2.5116e-38,
                        3.4431e-39, -1.1700e-39],
                      [-1.6042e-38, -6.2684e-39,  5.9911e-39,  ..., -2.0708e-38,
                       -2.3868e-38,  1.1104e-38],
                      ...,
                      [-1.3183e-38,  5.2702e-38, -2.9224e-38,  ..., -2.6706e-39,
                       -1.4705e-38,  5.9826e-38],
                      [ 4.4933e-38, -2.5312e-38,  1.5097e-38,  ..., -1.9861e-38,
                       -2.2453e-38, -4.4558e-38],
                      [-3.6120e-38,  3.7116e-38,  1.7498e-39,  ..., -7.4109e-39,
                        1.1676e-38, -3.2310e-38]])),
             ('conv1.weight',
              tensor([[[-1.7636e-02, -5.7468e-02, -5.6886e-02, -8.6085e-03,  1.5258e-02],
                       [ 2.286

In [None]:
model_info = {
    'model_state_dict': model.state_dict(),
    'vocab': vocab,  # Your vocabulary object
    'label_map': label_map,  # Your label mapping
    'embed_dim': embed_dim,
    'num_classes': num_classes,
    # Add any other relevant information
}

torch.save(model_info, model_save_path)

In [32]:
# If you saved the entire model
loaded_model = torch.load(model_save_path)

# If you saved the state dict
# model = YorubaCNN(vocab_size, embed_dim, num_classes)
# model.load_state_dict(torch.load(model_save_path))
# model.eval()  # Set the model to evaluation mode

In [None]:

# If you saved additional info
model_info = torch.load(model_save_path)
model = YorubaCNN(len(model_info['vocab']), model_info['embed_dim'], model_info['num_classes'])
model.load_state_dict(model_info['model_state_dict'])
model.eval()

In [34]:
loaded_model.eval()

YorubaCNN(
  (embedding): Embedding(32000, 256)
  (conv1): Conv1d(256, 128, kernel_size=(5,), stride=(1,), padding=(1,))
  (dropout): Dropout(p=0.5, inplace=False)
  (relu): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu3): ReLU()
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=480, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)