In [2]:
! pip install sentencepiece



In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load data
def load_data(file_path):
    data = pd.read_csv(file_path, delimiter='\t', header=None, names=['sentence', 'label'])
    return data


In [5]:
train_data = load_data('./train.tsv')
valid_data = load_data('./valid.tsv')
test_data = load_data('./test.tsv')


In [6]:
train_data.head()

Unnamed: 0,sentence,label
0,Tanzania,B-LOC
1,fi,O
2,Ajìjàgbara,O
3,Ọmọ,O
4,Orílẹ̀-èdèe,O


In [7]:
train_data.dropna(inplace=True)
valid_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [8]:
train_data

Unnamed: 0,sentence,label
0,Tanzania,B-LOC
1,fi,O
2,Ajìjàgbara,O
3,Ọmọ,O
4,Orílẹ̀-èdèe,O
...,...,...
20231,Khoeuk,O
20232,Keosineam,O
20233,jẹ́,O
20234,asọ̀tàn,O


In [9]:
# Path to your files
words_file_path = '/content/yor_wikipedia_2021_10K-words.txt'
sentences_file_path = '/content/yor_wikipedia_2021_10K-sentences.txt'



In [10]:
# Read and combine the contents
with open(words_file_path, 'r', encoding='utf-8') as words_file, \
     open(sentences_file_path, 'r', encoding='utf-8') as sentences_file, \
     open('combined_training_data.txt', 'w', encoding='utf-8') as combined_file:

    # Write words to the combined file, each on a new line
    for word in words_file:
        combined_file.write(word.strip() + '\n')

    # Write sentences to the combined file, each on a new line
    for sentence in sentences_file:
        combined_file.write(sentence.strip() + '\n')

print("Combined file created successfully.")


Combined file created successfully.


In [11]:
import sentencepiece as spm


In [12]:
# Train SentencePiece using the combined data file
spm.SentencePieceTrainer.train('--input=combined_training_data.txt --model_prefix=spm_yoruba --vocab_size=32000 --character_coverage=0.9995 --model_type=bpe')

In [13]:
# Load the trained model
sp = spm.SentencePieceProcessor()
sp.load('spm_yoruba.model')

True

In [14]:
def encode_sentences(sentence_list, sp_model):
    return [sp_model.encode_as_ids(sentence) for sentence in sentence_list]


In [15]:
train_encoded = encode_sentences(train_data['sentence'].astype(str).tolist(), sp)
valid_encoded = encode_sentences(valid_data['sentence'].astype(str).tolist(), sp)
test_encoded = encode_sentences(test_data['sentence'].astype(str).tolist(), sp)


In [16]:
def pad_sequences(encoded_sentences, max_len):
    padded_sequences = torch.zeros((len(encoded_sentences), max_len), dtype=torch.long)
    for i, seq in enumerate(encoded_sentences):
        length = min(max_len, len(seq))
        padded_sequences[i, :length] = torch.tensor(seq[:length], dtype=torch.long)
    return padded_sequences


In [17]:
max_len = 128  # or any other appropriate length
train_padded = pad_sequences(train_encoded, max_len)
valid_padded = pad_sequences(valid_encoded, max_len)
test_padded = pad_sequences(test_encoded, max_len)

In [22]:
!pip install scikit-learn

import torch
from sklearn.preprocessing import LabelEncoder

# Initialize a label encoder
label_encoder = LabelEncoder()

# Fit the encoder on all unique labels from all datasets
all_labels = train_data['label'].tolist() + valid_data['label'].tolist() + test_data['label'].tolist()
label_encoder.fit(all_labels)





In [23]:
# Transform labels to numerical representations
train_labels_encoded = label_encoder.transform(train_data['label'].tolist())
valid_labels_encoded = label_encoder.transform(valid_data['label'].tolist())
test_labels_encoded = label_encoder.transform(test_data['label'].tolist())


In [24]:
# Now create the tensors
train_labels = torch.tensor(train_labels_encoded, dtype=torch.long)
valid_labels = torch.tensor(valid_labels_encoded, dtype=torch.long)
test_labels = torch.tensor(test_labels_encoded, dtype=torch.long)

In [25]:
from torch.utils.data import DataLoader, TensorDataset

# Create TensorDatasets
train_dataset = TensorDataset(train_padded, train_labels)
valid_dataset = TensorDataset(valid_padded, valid_labels)
test_dataset = TensorDataset(test_padded, test_labels)

In [26]:

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [59]:
import torch.nn as nn
import torch.nn.functional as F

class YorubaCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(YorubaCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, 128, 5, padding=1)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(128, 64, kernel_size=3, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(64, 32, kernel_size=3, padding=1)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(64 * (embed_dim), 128)
        self.fc2 = nn.Linear(128, num_classes)  # Adjust fc layer


    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # Permute for Conv1d: batch_size x embed_dim x seq_len
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.pool3(x)

        # Dynamically calculate input size for the fully connected layer
        x = torch.flatten(x, 1)  # Flatten to prepare for the fully connected layer
        in_features = x.shape[1]
        self.fc = nn.Linear(in_features, self.fc.out_features)  # Adjust fc layer
        self.fc.to(x.device)  # Ensure fc is on the same device as input

        x = self.fc(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [60]:
# Model initialization
vocab_size = sp.get_piece_size()  # Number of tokens in the SentencePiece model
embed_dim = 256
num_classes = len(train_data['label'].unique())
model = YorubaCNN(vocab_size, embed_dim, num_classes)


In [61]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [62]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for sentence, labels in data_loader:
        sentence, labels = sentence.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(sentence)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for sentence, labels in data_loader:
            sentence, labels = sentence.to(device), labels.to(device)
            outputs = model(sentence)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
    return total_loss / len(data_loader)


In [63]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


YorubaCNN(
  (embedding): Embedding(32000, 256)
  (conv1): Conv1d(256, 128, kernel_size=(5,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu3): ReLU()
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=16384, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)

In [64]:
# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    valid_loss = evaluate(model, valid_loader, criterion, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss}, Valid Loss: {valid_loss}')


Epoch 1, Train Loss: 0.5527350629796466, Valid Loss: 0.6189571568797179
Epoch 2, Train Loss: 0.4069771256537178, Valid Loss: 0.4085470567566945
Epoch 3, Train Loss: 0.2793241171376309, Valid Loss: 0.3710368135012686
Epoch 4, Train Loss: 0.24097636559152671, Valid Loss: 0.35706297171521273
Epoch 5, Train Loss: 0.22307557728986563, Valid Loss: 0.3681016509202035
Epoch 6, Train Loss: 0.21193626163741405, Valid Loss: 0.38704010534976774
Epoch 7, Train Loss: 0.20462025965247888, Valid Loss: 0.39524479062186885
Epoch 8, Train Loss: 0.19927314469482232, Valid Loss: 0.3835757231394596
Epoch 9, Train Loss: 0.19391058612106343, Valid Loss: 0.4180294503334581
Epoch 10, Train Loss: 0.1909343829185547, Valid Loss: 0.4115348575992838
Epoch 11, Train Loss: 0.18781334513335812, Valid Loss: 0.4395089162041104
Epoch 12, Train Loss: 0.18573254401631778, Valid Loss: 0.5297066718559055
Epoch 13, Train Loss: 0.1825594782750148, Valid Loss: 0.4652536341406898
Epoch 14, Train Loss: 0.17991135105031544, Valid 

In [66]:
model.eval()

YorubaCNN(
  (embedding): Embedding(32000, 256)
  (conv1): Conv1d(256, 128, kernel_size=(5,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(64, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu3): ReLU()
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=480, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
)