In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [None]:
!pip install pythainlp



# 1) Tokenization
The goal of tokenization here is to

1) Convert string to a list of token strings -- so we need a word segmenter/tokenizer here

2) Convert a list of token strings to a list of token indices -- so we need
- a dictionary to keep index --> word
- a dictionary to keep word --> index

3) Pad or truncate to the same length

In [None]:
import pythainlp
class Tokenizer:
    """Word-level tokenizer that turns text into fixed-length lists of vocabulary indices.

    The vocabulary is built by `tokenize_training_set`. Index 0 is reserved for
    '<PAD>'; '<UNK>' is added once, after the first training pass, and is used for
    every token that is not in the vocabulary.

    Args:
        seq_length: Number of indices in every returned sequence (shorter inputs
            are padded, longer ones are truncated).
        word_segmenter: Optional callable mapping a string to a list of token
            strings. Defaults to `pythainlp.word_tokenize`.
    """

    def __init__(self, seq_length, word_segmenter=None):
        self.word2idx = {'<PAD>':0}
        self.idx2word = {0:'<PAD>'}
        # Stays None until the vocabulary has been built.
        self.UNK = None
        self.word_segmenter = pythainlp.word_tokenize if word_segmenter is None else word_segmenter
        self.seq_length = seq_length

    def tokenize_training_set(self, text_list):
        """Build/extend the vocabulary from `text_list` and return its index sequences.

        Args:
            text_list: Iterable of raw text strings.

        Returns:
            A list with one list of `seq_length` indices per input text.
        """
        training_set = []
        for text in text_list:
            tokens = self.word_segmenter(text)
            for token in tokens:
                if token not in self.word2idx:
                    index = len(self.word2idx)
                    self.word2idx[token] = index
                    self.idx2word[index] = token
            indices = [self.word2idx[token] for token in tokens]
            training_set.append(self.pad_or_truncate(indices))
        # Register <UNK> exactly once. Re-assigning it on every call would push its
        # index to len(word2idx), i.e. outside a vocabulary of that size, and leave
        # a stale idx2word entry behind.
        self.UNK = self.word2idx.setdefault('<UNK>', len(self.word2idx))
        self.idx2word[self.UNK] = '<UNK>'
        return training_set

    def pad_or_truncate(self, index_list):
        """Return `index_list` cut or right-padded with the '<PAD>' index to `seq_length`."""
        if len(index_list) >= self.seq_length:
            return index_list[:self.seq_length]
        return index_list + [self.word2idx['<PAD>']] * (self.seq_length - len(index_list))

    def tokenize_dataset(self, text_list):
        """Convert each text in `text_list` to indices using the existing vocabulary.

        Returns:
            A list with one list of `seq_length` indices per input text.

        Raises:
            RuntimeError: If the vocabulary has not been built yet.
        """
        return [self.tokenize_sentence(text) for text in text_list]

    def tokenize_sentence(self, text):
        """Convert one text to `seq_length` indices; unknown tokens map to '<UNK>'.

        Raises:
            RuntimeError: If the vocabulary has not been built yet.
        """
        if self.UNK is None:
            # Without this guard unknown tokens would silently become None.
            raise RuntimeError(
                "Vocabulary has not been built; call tokenize_training_set() first."
            )
        list_indices = [self.word2idx.get(token, self.UNK) for token in self.word_segmenter(text)]
        return self.pad_or_truncate(list_indices)


ModuleNotFoundError: No module named 'pythainlp'

# 2) Create DataSet container and DataLoader

We subclass the `Dataset` class to store the lists of token indices and their labels. The goal is to

1) keep X and Y in one place.

2) Make sure that the train, dev, and test sets are stored and accessed in the same way.

We must implement three methods: `__init__`, `__len__`, and `__getitem__`.

We then feed the dataset to `DataLoader`, which helps us batch the data for training (or inference).


In [None]:
class TextDataset(Dataset):
    """Dataset pairing padded index sequences with their labels.

    Args:
        X: List of index sequences (each a list of ints).
        Y: List of integer labels, one per sequence in `X`.
        tokenizer: Object exposing a `word2idx` mapping that contains '<PAD>'.

    Raises:
        ValueError: If `X` and `Y` do not contain the same number of samples.
    """

    def __init__(self, X, Y, tokenizer):
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore the extra labels.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same number of samples, got {len(X)} and {len(Y)}"
            )
        pad_index = tokenizer.word2idx['<PAD>']
        # count indices that are not padding
        self.lengths = [sum(1 for x in indices if x != pad_index) for indices in X]
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)  # Number of samples in dataset

    def __getitem__(self, idx):
        """Return `(indices, non_pad_length, label)` for sample `idx` as long tensors."""
        return (
            torch.tensor(self.X[idx], dtype=torch.long),
            torch.tensor(self.lengths[idx], dtype=torch.long),
            torch.tensor(self.Y[idx], dtype=torch.long),
        )

# 3) Set up the model

We will subclass `torch.nn.Module` because it works well with the training functions provided by Torch. We will implement two methods: `__init__` and `forward`.


In [None]:
class DeepAveragingNetwork(nn.Module):
    """Deep Averaging Network: average the word embeddings, then apply a 2-layer MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each word embedding.
        hidden_dim: Size of the hidden layer.
        output_dim: Number of output classes.
        padding_idx: Index whose embedding is kept out of the average (default 0).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, padding_idx=0):
        super(DeepAveragingNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # Only used by predict_proba(); forward() returns raw logits.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, lengths):
        """Return unnormalized class scores (logits).

        Args:
            x: Long tensor of token indices, shape (batch, seq_len).
            lengths: Tensor of non-padding token counts, shape (batch,).

        Returns:
            Float tensor of shape (batch, output_dim). These are logits, which is
            what `nn.CrossEntropyLoss` expects; applying softmax here as well would
            normalize twice and flatten the gradients.
        """
        embeds = self.embedding(x)  # (batch, seq_len, embed_dim)
        sum_embeds = torch.sum(embeds, dim=1)  # (batch, embed_dim)
        # Clamp so an all-padding sequence (length 0) averages to zeros, not NaN.
        counts = lengths.unsqueeze(1).float().clamp(min=1.0)
        avg_embeds = sum_embeds / counts
        hidden = self.relu(self.fc1(avg_embeds))
        return self.fc2(hidden)  # (batch, num class)

    def predict_proba(self, x, lengths):
        """Return class probabilities, shape (batch, output_dim); each row sums to 1."""
        return self.softmax(self(x, lengths))


# 4) Load up the data

In [None]:
# Training set: 15 (text, label) pairs of Thai customer-service messages.
# Each label is one of "Question", "Ask for agent", or "Complaint".
train_set = [
    ("มีสาขาที่ใกล้ที่สุดอยู่ที่ไหน?", "Question"),
    ("ร้านปิดกี่โมง?", "Question"),
    ("ส่งของใช้เวลากี่วัน?", "Question"),
    ("มีโปรโมชั่นอะไรบ้าง?", "Question"),
    ("ช่วยโอนสายไปที่เจ้าหน้าที่ได้ไหม?", "Ask for agent"),
    ("ขอคุยกับพนักงานหน่อย", "Ask for agent"),
    ("มีแอดมินตอบไหม?", "Ask for agent"),
    ("ต้องการคุยกับฝ่ายบริการลูกค้า", "Ask for agent"),
    ("สินค้าส่งมาผิด!", "Complaint"),
    ("ได้รับของเสียหาย", "Complaint"),
    ("พัสดุยังไม่มาส่งเลย", "Complaint"),
    ("ทำไมยังไม่ได้รับคำตอบ?", "Complaint"),
    ("ของที่ส่งมาขาดชิ้นส่วน", "Complaint"),
    ("พนักงานพูดจาไม่สุภาพ", "Complaint"),
    ("ระบบใช้งานไม่ได้", "Complaint"),
]

# Development set: 15 (text, label) pairs with the same three labels as the
# training set; the training loop evaluates on it after every epoch.
dev_set = [
    ("มีบริการเก็บเงินปลายทางไหม?", "Question"),
    ("สามารถเปลี่ยนที่อยู่จัดส่งได้หรือเปล่า?", "Question"),
    ("สินค้านี้มีสีอื่นไหม?", "Question"),
    ("ค่าจัดส่งเท่าไหร่?", "Question"),
    ("ขอโอนสายไปฝ่ายบัญชีได้ไหม?", "Ask for agent"),
    ("มีใครช่วยตอบคำถามได้ไหม?", "Ask for agent"),
    ("ช่วยหาคนที่รับผิดชอบให้หน่อย", "Ask for agent"),
    ("ต้องการพูดคุยกับฝ่ายเทคนิค", "Ask for agent"),
    ("ทำไมของที่ส่งมาใช้ไม่ได้?", "Complaint"),
    ("ได้รับของไม่ครบตามที่สั่ง", "Complaint"),
    ("แอปเด้งตลอดเวลา ใช้ไม่ได้เลย", "Complaint"),
    ("พนักงานไม่ช่วยแก้ปัญหาให้", "Complaint"),
    ("ระบบบอกว่าส่งแล้วแต่ยังไม่ได้รับของ", "Complaint"),
    ("ของที่ได้รับไม่ตรงกับที่โฆษณา", "Complaint"),
    ("ช่องทางติดต่อยากมาก", "Complaint"),
]

# Test set: 15 held-out (text, label) pairs with the same three labels as the
# training set.
test_set = [
    ("สามารถยกเลิกคำสั่งซื้อได้ไหม?", "Question"),
    ("วิธีขอคืนเงินต้องทำอย่างไร?", "Question"),
    ("ใช้โค้ดส่วนลดอย่างไร?", "Question"),
    ("เมื่อไหร่สินค้าจะกลับมาในสต็อก?", "Question"),
    ("ช่วยโอนสายไปที่ผู้จัดการได้ไหม?", "Ask for agent"),
    ("ต้องการให้เจ้าหน้าที่โทรกลับ", "Ask for agent"),
    ("มีคนคอยช่วยเหลืออยู่ไหม?", "Ask for agent"),
    ("ช่วยติดต่อฝ่ายสนับสนุนให้หน่อย", "Ask for agent"),
    ("ส่งของผิดที่", "Complaint"),
    ("ของที่ได้รับมีรอยขีดข่วน", "Complaint"),
    ("ทำไมยังไม่มีการอัปเดตสถานะพัสดุ?", "Complaint"),
    ("ขอคืนเงินแต่ยังไม่ได้รับเงิน", "Complaint"),
    ("ได้รับสินค้าหมดอายุ", "Complaint"),
    ("แพ็กเกจของฉีกขาด", "Complaint"),
    ("พนักงานตอบช้ามาก", "Complaint"),
]

# Fit the vocabulary on the training texts first, then index the dev and test
# texts with that same vocabulary (words unseen in training map to <UNK>).
tokenizer = Tokenizer(seq_length=10)
tokenized_train_set = tokenizer.tokenize_training_set([pair[0] for pair in train_set])
tokenized_dev_set = tokenizer.tokenize_dataset([pair[0] for pair in dev_set])
tokenized_test_set = tokenizer.tokenize_dataset([pair[0] for pair in test_set])

# Class name -> integer class id.
label_dict = dict(zip(('Question', 'Ask for agent', 'Complaint'), range(3)))
Y_train = [label_dict[pair[1]] for pair in train_set]
Y_dev = [label_dict[pair[1]] for pair in dev_set]
Y_test = [label_dict[pair[1]] for pair in test_set]

# Bundle inputs and labels of each split into a Dataset.
train_dataset = TextDataset(X=tokenized_train_set, Y=Y_train, tokenizer=tokenizer)
dev_dataset = TextDataset(X=tokenized_dev_set, Y=Y_dev, tokenizer=tokenizer)
test_dataset = TextDataset(X=tokenized_test_set, Y=Y_test, tokenizer=tokenizer)


# 5) Train

In [None]:
# Model parameters
SEED = 42
VOCAB_SIZE = len(tokenizer.word2idx)
EMBED_DIM = 50
HIDDEN_DIM = 50
# Derived from the label mapping so it cannot drift from the number of classes.
OUTPUT_DIM = len(label_dict)

# Seed PyTorch's global RNG so weight initialisation (and any later shuffling
# that draws from it) is repeatable on Restart & Run All.
torch.manual_seed(SEED)

# Instantiate model
model = DeepAveragingNetwork(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)



In [None]:
# prompt: Train the model for 5 epochs and evaluate on dev_dataset every epoch

# Training loop
EPOCHS = 5
BATCH_SIZE = 2

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

for epoch in range(EPOCHS):
    model.train()
    train_loss_sum = 0.0
    train_count = 0
    for X, lengths, Y in train_dataloader:
        optimizer.zero_grad()
        predictions = model(X, lengths)
        loss = criterion(predictions, Y)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean (default 'mean' reduction), so weight
        # it by the batch size to get a true per-sample average for the epoch.
        train_loss_sum += loss.item() * Y.size(0)
        train_count += Y.size(0)
    # Report once per epoch instead of once per batch.
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {train_loss_sum / max(train_count, 1):.4f}")

    # Evaluate on the development set
    model.eval()
    dev_loss_sum = 0.0
    dev_count = 0
    with torch.no_grad():
        for X, lengths, Y in dev_dataloader:
            predictions = model(X, lengths)
            loss = criterion(predictions, Y)
            # Weight by batch size: the last batch may be smaller than BATCH_SIZE,
            # and a plain mean of batch means would over-weight it.
            dev_loss_sum += loss.item() * Y.size(0)
            dev_count += Y.size(0)
    avg_dev_loss = dev_loss_sum / max(dev_count, 1)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Dev Loss: {avg_dev_loss:.4f}")


Epoch 1/5, Loss: 1.0979
Epoch 1/5, Loss: 1.1265
Epoch 1/5, Loss: 1.0399
Epoch 1/5, Loss: 1.1092
Epoch 1/5, Loss: 1.0390
Epoch 1/5, Loss: 1.1533
Epoch 1/5, Loss: 1.0562
Epoch 1/5, Loss: 1.0735
Epoch 1/5, Dev Loss: 1.0715
Epoch 2/5, Loss: 0.8230
Epoch 2/5, Loss: 0.8895
Epoch 2/5, Loss: 1.1082
Epoch 2/5, Loss: 0.9630
Epoch 2/5, Loss: 0.9414
Epoch 2/5, Loss: 0.8723
Epoch 2/5, Loss: 0.7567
Epoch 2/5, Loss: 1.0580
Epoch 2/5, Dev Loss: 1.0097
Epoch 3/5, Loss: 0.6892
Epoch 3/5, Loss: 0.8620
Epoch 3/5, Loss: 0.7167
Epoch 3/5, Loss: 0.8302
Epoch 3/5, Loss: 0.7750
Epoch 3/5, Loss: 0.7402
Epoch 3/5, Loss: 0.8170
Epoch 3/5, Loss: 0.5609
Epoch 3/5, Dev Loss: 0.9762
Epoch 4/5, Loss: 0.5934
Epoch 4/5, Loss: 0.5630
Epoch 4/5, Loss: 0.5811
Epoch 4/5, Loss: 0.5606
Epoch 4/5, Loss: 0.6984
Epoch 4/5, Loss: 0.7167
Epoch 4/5, Loss: 0.5915
Epoch 4/5, Loss: 0.7228
Epoch 4/5, Dev Loss: 0.9425
Epoch 5/5, Loss: 0.5543
Epoch 5/5, Loss: 0.5800
Epoch 5/5, Loss: 0.5741
Epoch 5/5, Loss: 0.5534
Epoch 5/5, Loss: 0.6225
