Sentence Classification Examples:

    1. Sentiment Analysis
        1.1 Positive
        1.2 Negative
        I really hate this music -> Negative -> 0
        I love this food -> Positive -> 1
    2. Language identification: detect which language a sentence is written in
    3. Names dataset classification
    


In [None]:
%%capture
!pip install unidecode

In [None]:
import torch
from google.colab import drive

drive.mount("/drive")

Mounted at /drive


In [None]:
from unidecode import unidecode

In [None]:
!ls /drive/MyDrive/data/names

Arabic.txt   English.txt  Irish.txt	Polish.txt	Spanish.txt
Chinese.txt  French.txt   Italian.txt	Portuguese.txt	Vietnamese.txt
Czech.txt    German.txt   Japanese.txt	Russian.txt
Dutch.txt    Greek.txt	  Korean.txt	Scottish.txt


In [None]:
import os
from glob import glob

In [None]:
root_dir = "/drive/MyDrive/data/names"
file_names = glob("*.txt", root_dir=root_dir)
unique_labels = sorted([os.path.splitext(file_name)[0] for file_name in file_names])
n_labels = len(unique_labels)

idx2label = {idx:label for idx, label in enumerate(unique_labels)}
label2idx = {label:idx for idx, label in idx2label.items()}

In [None]:
def replace(name, chars, target):
    """Return `name` with every occurrence of each item in `chars` replaced by `target`.

    The substitutions are applied one after another, in the order given by
    `chars`, so a later item also sees the result of the earlier ones.
    """
    cleaned = name
    for needle in chars:
        cleaned = cleaned.replace(needle, target)
    return cleaned

In [None]:
# Parallel lists: X_names[i] is a cleaned name, Y_labels[i] is its label.
X_names = []
Y_labels = []

for file_name in file_names:
    # The label depends only on the file, so compute it once per file
    # instead of once per line.
    label = os.path.splitext(file_name)[0]

    with open(os.path.join(root_dir, file_name), "rt", encoding='utf-8') as f:
        for line in f:
            # Drop non-breaking spaces BEFORE transliteration: unidecode maps
            # "\xa0" to a regular space, so removing it afterwards never matches.
            name = line.replace("\xa0", "").strip().lower()
            name = unidecode(name)

            # Navigation text that leaked into the dataset, not a real name.
            if name == 'to the first page':
                continue

            name = replace(name, [",", '1', "/b", ":"], '')
            name = replace(name, ['-'], ' ')

            # Cleaning can leave stray spaces at the edges, or nothing at all
            # (blank or junk-only lines); an empty name would become a
            # zero-length sequence downstream, so skip it.
            name = name.strip()
            if not name:
                continue

            X_names.append(name)
            Y_labels.append(label)

1. Remove "To The First Page" names from dataset
2. Remove ",", "1", "/B", ":" and "\xa0" (non-breaking space) from each name, i.e. replace them with the empty string
3. Replace '-' with ' '
4. Convert all the following from unicode to ascii:
[ 'ß',
 'à',
 'á',
 'ã',
 'ä',
 'ç',
 'è',
 'é',
 'ê',
 'ì',
 'í',
 'ñ',
 'ò',
 'ó',
 'õ',
 'ö',
 'ù',
 'ú',
 'ü',
 'ą',
 'ł',
 'ń',
 'ś',
 'ż']

In [None]:
# Padding symbol and the id it occupies (it is placed first in the vocabulary).
pad_token = '.'
pad_token_id = 0

# Vocabulary: the pad token followed by every distinct character seen in the
# cleaned names, in sorted order.
unique_chars = [pad_token, *sorted(set(''.join(X_names)))]

# Two-way lookup between integer ids and characters.
idx2char = dict(enumerate(unique_chars))
char2idx = dict(zip(idx2char.values(), idx2char.keys()))

def encode(name: str) -> list[int]:
    """Translate `name` into a list of character ids using `char2idx`.

    Raises:
        KeyError: if `name` contains a character missing from `char2idx`.
    """
    return list(map(char2idx.__getitem__, name))

def decode(ids: list[int]) -> str:
    """Translate a list of character ids back into a string using `idx2char`.

    Raises:
        KeyError: if an id is missing from `idx2char`.
    """
    chars = [idx2char[token_id] for token_id in ids]
    return ''.join(chars)

In [None]:
# Targets: one integer label id per sample.
Y = list(map(label2idx.__getitem__, Y_labels))
# Inputs: one list of character ids per sample.
X = list(map(encode, X_names))

In [None]:
# Sanity check: print the first five samples as
# "<character ids> -> <cleaned name> <label id> -> <label name>".
for x, x_name, y, y_label in zip(X[:5], X_names[:5], Y[:5], Y_labels[:5]):
    print(f"{str(x):<35} -> {x_name:<10} {y} -> {y_label}")

[3, 3, 14, 21, 4, 23, 20, 9]        -> aalsburg   3 -> Dutch
[3, 3, 14, 21, 22]                  -> aalst      3 -> Dutch
[3, 3, 20, 14, 7]                   -> aarle      3 -> Dutch
[3, 5, 10, 22, 7, 20, 7, 16]        -> achteren   3 -> Dutch
[3, 5, 10, 22, 10, 17, 24, 7, 16]   -> achthoven  3 -> Dutch


1. Split data into train and test
2. NamesDataset for both train and test
3. Data Loader for both train and test with custom `collate` function

In [None]:
from sklearn.model_selection import  train_test_split

In [None]:
# Hold out 20% of the samples for testing. `stratify=Y` keeps the label
# proportions the same in both splits, and a fixed `random_state` makes the
# split (and therefore every metric computed on it) reproducible across runs.
Xtr, Xts, Ytr, Yts = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [None]:
from torch.utils.data import Dataset, DataLoader

class NamesDataset(Dataset):
    """Map-style dataset that pairs each input sample with its target.

    Args:
        X: indexable collection of inputs (in this notebook, lists of
            character ids produced by `encode`).
        Y: indexable collection of targets aligned with `X` (in this
            notebook, integer label ids).

    Raises:
        ValueError: if `X` and `Y` do not have the same length.
    """

    def __init__(self, X, Y):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently leave some targets unused.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the `(input, target)` pair stored at position `idx`."""
        return self.X[idx], self.Y[idx]

# Wrap the train and test splits so they can be handed to a DataLoader.
Dtr, Dts = NamesDataset(Xtr, Ytr), NamesDataset(Xts, Yts)

In [None]:
# Upper bound on the padded sequence length; longer samples are truncated.
global_max_n = 20

def collate_fn(batch):
    """Turn a list of `(ids, label)` pairs into a pair of padded tensors.

    Every sequence is right-padded with zeros to the length of the longest
    sequence in the batch, capped at `global_max_n`; sequences longer than the
    cap keep only their first `global_max_n` ids.

    Returns:
        A `(len(batch), width)` long tensor of ids and a `(len(batch),)` long
        tensor of labels.
    """
    seqs, labels = zip(*batch)
    width = min(max(map(len, seqs)), global_max_n)
    x_padded = torch.zeros(len(seqs), width, dtype=torch.long)

    for row, seq in enumerate(seqs):
        kept = seq[:width]
        x_padded[row, :len(kept)] = torch.tensor(kept)

    return x_padded, torch.tensor(labels, dtype=torch.long)


# Training loader: reshuffled every epoch, incomplete final batch dropped.
Dltr = DataLoader(
    Dtr,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)
# Test loader: fixed order, every sample kept (the last batch may be smaller).
Dlts = DataLoader(
    Dts,
    batch_size=4,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

In [None]:
idx2label

{0: 'Arabic',
 1: 'Chinese',
 2: 'Czech',
 3: 'Dutch',
 4: 'English',
 5: 'French',
 6: 'German',
 7: 'Greek',
 8: 'Irish',
 9: 'Italian',
 10: 'Japanese',
 11: 'Korean',
 12: 'Polish',
 13: 'Portuguese',
 14: 'Russian',
 15: 'Scottish',
 16: 'Spanish',
 17: 'Vietnamese'}

In [None]:
# Define Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass

class NamesClassifier(nn.Module):
    """Character-level CNN classifier: embedding -> Conv1d -> global max-pool -> linear.

    Args:
        config: object exposing `vocab_size`, `n_embd`, `n_conv_channels`,
            `kernel_size`, `drop_rate` and `n_labels`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.emb = nn.Embedding(self.config.vocab_size, self.config.n_embd) # B, T, C
        self.conv = nn.Conv1d(self.config.n_embd, self.config.n_conv_channels, self.config.kernel_size)

        self.max_pool = nn.AdaptiveMaxPool1d(1)
        self.drop = nn.Dropout(self.config.drop_rate)
        self.fc = nn.Linear(self.config.n_conv_channels, self.config.n_labels)

    def forward(self, x):
        """Compute class logits for a batch of id sequences.

        Args:
            x: integer tensor of character ids, expected shape `(B, T)`.

        Returns:
            Float tensor of shape `(B, n_labels)` holding unnormalized logits.
        """
        x = self.emb(x)            # (B, T, n_embd)
        x = x.permute(0, 2, 1)     # (B, n_embd, T): Conv1d wants channels first

        # Conv1d has no padding, so it raises when T < kernel_size (e.g. a
        # batch made only of very short names). Right-pad with zeros up to the
        # kernel size so such batches still produce one convolution window.
        missing = self.conv.kernel_size[0] - x.size(-1)
        if missing > 0:
            x = F.pad(x, (0, missing))

        x = self.conv(x)           # (B, n_conv_channels, T')
        x = self.max_pool(x)       # (B, n_conv_channels, 1)
        # Squeeze only the pooled axis: a bare squeeze() would also drop the
        # batch axis when B == 1 and return logits of shape (n_labels,).
        x = self.drop(x.squeeze(-1))
        x = self.fc(x)             # (B, n_labels)
        return x

@dataclass
class Config:
    """Hyperparameters read by `NamesClassifier` when it builds its layers."""
    vocab_size: int       # number of entries in the embedding table
    n_embd: int           # embedding size; also the Conv1d input channels
    n_conv_channels: int  # Conv1d output channels; also the linear layer's input size
    kernel_size: int      # width of the Conv1d kernel
    drop_rate: float      # dropout probability applied to the pooled features
    n_labels: int         # number of output classes (size of the logits vector)

# Take the data-dependent sizes from the dataset instead of hard-coding them:
# a vocabulary larger than a fixed `vocab_size` would make the embedding lookup
# fail, and a different number of label files would mis-size the output layer.
config = Config(
    vocab_size=len(unique_chars),
    n_embd=16,
    n_conv_channels=32,
    kernel_size=3,
    drop_rate=0.5,
    n_labels=n_labels,
)
model = NamesClassifier(config)

In [None]:
model

NamesClassifier(
  (emb): Embedding(29, 16)
  (conv): Conv1d(16, 32, kernel_size=(3,), stride=(1,))
  (max_pool): AdaptiveMaxPool1d(output_size=1)
  (drop): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=18, bias=True)
)

In [None]:
# training loop
n_epochs = 3

# Without an optimizer and a backward pass the weights never change, so the
# loss would only be computed and thrown away.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

best_acc = -1.0
best_state = None  # in-memory copy of the weights with the best test accuracy

for epoch in range(1, n_epochs+1):
    model.train()  # enable dropout
    running_loss = 0.0
    n_batches = 0

    for x, y in Dltr:
        optimizer.zero_grad()
        logits = model(x)
        loss = F.cross_entropy(logits, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    # evaluate
    model.eval()  # disable dropout
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for x_ts, y_ts in Dlts:
            # reshape keeps a batch axis even if the model hands back a 1-D
            # tensor for a single-sample batch (the last test batch can be small).
            preds = model(x_ts).reshape(len(y_ts), -1).argmax(dim=1)
            n_correct += (preds == y_ts).sum().item()
            n_seen += len(y_ts)

    acc = n_correct / max(n_seen, 1)
    avg_loss = running_loss / max(n_batches, 1)
    print(f"epoch {epoch}: train loss {avg_loss:.4f}, test accuracy {acc:.4f}")

    # save the best model
    # NOTE(review): selection uses the test split because no separate
    # validation split exists here - consider adding one.
    if acc > best_acc:
        best_acc = acc
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}


In [None]:
# TODO: compute accuracy and a confusion matrix on the test set

In [None]:
logits = model(x)

In [None]:
logits[0]

tensor([ 0.2658,  0.1072, -0.8136,  0.4002,  0.0481, -0.1837, -1.2992,  0.0684,
        -0.4996,  0.5389, -0.3511, -0.7628,  0.1983, -0.4705,  0.5058, -0.0566,
         0.7549, -0.1128], grad_fn=<SelectBackward0>)

In [None]:
F.softmax(logits[0], dim=0)

tensor([0.0704, 0.0600, 0.0239, 0.0805, 0.0566, 0.0449, 0.0147, 0.0578, 0.0327,
        0.0925, 0.0380, 0.0252, 0.0658, 0.0337, 0.0895, 0.0510, 0.1148, 0.0482],
       grad_fn=<SoftmaxBackward0>)

In [None]:
F.softmax(logits[0], dim=0).argmax()

tensor(16)

In [None]:
F.softmax(logits[0], dim=0).shape

torch.Size([18])

In [None]:
logits[0].argmax()

tensor(16)