## Semi-Supervised Implementation
### Mandarin Word Segmentation Using BiLSTMs


Import statements.

In [1]:
import torch
from torch import nn
import torch.optim

from torchtext import data
from torchtext import datasets

import numpy as np

import time
import random

Environment variables. **Set `train_file` and `test_file` to the relative filepaths of the data.** If `test_file` is an empty string no test data will be used.
The validation split determines the percentage of training samples set aside for validation.

In [2]:
# Relative path of the tab-separated training file.
train_file = "data/train.tsv"
# Relative path of an optional test file; an empty string disables the test run.
test_file = ""
# Fraction of the training examples set aside for validation.
val_split = 0.3

Set random seed for reproducibility.

In [3]:
# Single seed shared by every random number generator seeded below.
SEED = 1234

# Seed Python's, NumPy's and PyTorch's generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN to stick to deterministic algorithms when running on a GPU.
torch.backends.cudnn.deterministic = True

Declare the `TEXT` and `TAGS` fields. In this implementation, the `TAGS` field represents whether or not a character is the end of a word.

In [4]:
# Input characters; `lower = True` asks the field to lowercase its text.
TEXT = data.Field(lower = True)
# Per-character tags; `unk_token = None` keeps an unknown-tag entry out of the tag vocabulary.
TAGS = data.Field(unk_token = None)



In [5]:
# (attribute name, field) pairs, in the same order as the columns of the data file.
fields = (("text", TEXT), ("tags", TAGS))

I again had to modify the `SequenceTaggingDataset` from torchtext. This time rather than specifying a character for a new example, I divided the examples into 500-character chunks.

In [20]:
class SequenceTaggingDataset(data.Dataset):
    """Sequence-tagging dataset read from a column-separated text file.

    Every non-blank line of the file holds one row whose columns are split on
    ``separator``; column ``i`` is fed to ``fields[i]``. Consecutive rows are
    grouped into chunks of ``chunk_size`` rows and each chunk becomes one
    example.
    """

    @staticmethod
    def sort_key(example):
        """Return the length of the first non-callable, non-dunder attribute.

        Falls back to 0 when the example carries no such attribute.
        """
        for attr in dir(example):
            if not callable(getattr(example, attr)) and \
                    not attr.startswith("__"):
                return len(getattr(example, attr))
        return 0

    def __init__(self, path, fields, val_split=0, encoding="utf-8", separator="\t",
                 chunk_size=500, **kwargs):
        """Read ``path`` and build one example per ``chunk_size`` rows.

        Args:
            path: File to read.
            fields: (name, field) pairs, one per column of the file.
            val_split: Accepted for backward compatibility; not used here.
            encoding: Text encoding used to open the file.
            separator: String that separates the columns of a row.
            chunk_size: Number of rows grouped into a single example.
            **kwargs: Forwarded to ``data.Dataset``.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        print("Loading data...")
        examples = []
        columns = []

        with open(path, encoding=encoding) as input_file:
            for line in input_file:
                line = line.strip()
                # A blank line would add an empty token to the first column
                # only, leaving the columns with different lengths.
                if not line:
                    continue
                # Flush on the number of rows collected (not on the raw line
                # index) so skipped lines cannot shorten a chunk.
                if columns and len(columns[0]) >= chunk_size:
                    examples.append(data.Example.fromlist(columns, fields))
                    columns = []
                for i, column in enumerate(line.split(separator)):
                    if len(columns) < i + 1:
                        columns.append([])
                    columns[i].append(column)
            # Whatever is left over forms a final, shorter example.
            if columns:
                examples.append(data.Example.fromlist(columns, fields))
        print("Data loaded from {}".format(path))
        super(SequenceTaggingDataset, self).__init__(examples, fields,
                                                     **kwargs)

Load the data into a Pytorch dataset and split based on the provided `val_split`. Load the test dataset if one is provided.

In [35]:
# Build the examples from the training file and split them; `split_ratio` is the
# share kept for training, i.e. everything that is not part of `val_split`.
train_data, val_data = SequenceTaggingDataset(train_file, fields).split(split_ratio=1-val_split)
# `test_data` is only bound when a test file was configured.
if len(test_file) > 0:
    test_data = SequenceTaggingDataset(test_file, fields)

Loading data...
Data loaded from data/train.tsv


In [36]:
# Report how many examples ended up in each split.
print("Training samples: {}".format(len(train_data)))
print("Validation samples: {}".format(len(val_data)))
# `test_data` exists only when a test file was loaded, hence the globals() check.
if "test_data" in globals():
    print("Testing samples: {}".format(len(test_data)))

Training samples: 11716
Validation samples: 5021


Quick sanity check.

In [37]:
# Dump the attributes of the first training example to eyeball the parsed data.
print(vars(train_data.examples[0]))

{'text': ['婆', '婆', '在', '長', '期', '的', '耳', '濡', '目', '染', '之', '下', '，', '也', '都', '是', '玩', '模', '型', '的', '高', '手', '。', '每', '當', '假', '日', '無', '處', '去', '時', '，', '全', '家', '陶', '醉', '在', '模', '型', '世', '界', '中', '，', '其', '樂', '融', '融', '。', '一', '種', '視', '覺', '上', '錯', '誤', '的', '反', '應', '現', '象', '，', '錯', '視', '早', '就', '被', '發', '現', '了', '，', '我', '們', '的', '眼', '睛', '受', '到', '環', '境', '的', '影', '響', '做', '出', '錯', '誤', '的', '判', '斷', '時', '，', '直', '線', '可', '能', '看', '成', '曲', '線', '，', '平', '行', '線', '可', '能', '看', '成', '歪', '斜', '線', '，', '失', '之', '毫', '釐', '，', '差', '以', '千', '里', '，', '有', '時', '錯', '的', '瘋', '狂', '，', '錯', '的', '離', '譜', '。', '大', '家', '常', '說', '眼', '見', '為', '憑', '，', '由', '於', '我', '們', '對', '眼', '睛', '的', '信', '賴', '程', '度', '，', '遠', '超', '過', '其', '他', '的', '知', '覺', '感', '觀', '，', '一', '旦', '看', '見', '與', '事', '實', '不', '相', '符', '的', '圖', '形', '時', '，', '第', '一', '個', '反', '應', '是', '不', '相', '信', '，', '非', '得', '以', '規', '矩', '實', '量', 

Build the vocab. Only tokens that appear at least twice in the training data get their own embedding. Unseen tokens and tokens with a single occurrence all share the unknown-token entry, so they will be judged solely on their surrounding context.

In [38]:
# Minimum number of occurrences required for a token to enter the TEXT vocabulary.
MIN_FREQ = 2

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data,
                 min_freq = MIN_FREQ)
TAGS.build_vocab(train_data)

In [39]:
# Vocabulary size for the input tokens, and the index-to-tag list for the labels.
print("Number unique tokens in TEXT: {}".format(len(TEXT.vocab)))
print("Unique tokens in TAG: {}".format(TAGS.vocab.itos))

Number unique tokens in TEXT: 5454
Unique tokens in TAG: ['<pad>', '1', '0']


Set the batch size and the GPU if one is available. **I was only able to run this in a reasonable amount of time using a GPU**.
Then create the iterators to produce batches.

In [40]:
BATCH_SIZE = 128

# Preferred GPU index. Machines with fewer GPUs fall back to the first one, and
# machines without CUDA fall back to the CPU, so the notebook runs anywhere.
GPU_INDEX = 2

if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > GPU_INDEX:
    device = torch.device('cuda:{}'.format(GPU_INDEX))
else:
    device = torch.device('cuda:0')
print(device)

# Batch iterators for the training and validation splits.
train_iterator, val_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = BATCH_SIZE,
    device = device
)
# `test_data` is only bound when a test file was loaded.
if "test_data" in globals():
    test_iterator = data.BucketIterator(
        test_data,
        batch_size = BATCH_SIZE,
        device = device
    )

cuda:2




Declare the model class. I used the same model as the Celtic Mutations project. The only changes required were hyperparameter modifications.

In [41]:
class WordSegmenter(nn.Module):
    """Embedding -> LSTM -> linear tagger that scores every input position.

    Dropout is applied to the embeddings and again to the LSTM outputs before
    the final linear layer.
    """

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_idx):
        """Create the layers.

        Args:
            input_dim: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM (per direction).
            output_dim: Number of scores produced per position.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Dropout probability.
            pad_idx: Index passed to the embedding as ``padding_idx``.
        """
        super().__init__()

        # Inter-layer LSTM dropout is only requested for a stacked LSTM.
        if n_layers > 1:
            lstm_dropout = dropout
        else:
            lstm_dropout = 0

        # A bidirectional LSTM emits features for both directions.
        if bidirectional:
            lstm_out_dim = hidden_dim * 2
        else:
            lstm_out_dim = hidden_dim

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=lstm_dropout)
        self.fc = nn.Linear(lstm_out_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one score vector per input position.

        ``text`` holds token indices; the LSTM is created without
        ``batch_first``, so presumably it is laid out (seq_len, batch) -- TODO confirm.
        """
        embedded = self.embedding(text)
        embedded = self.dropout(embedded)
        # The final hidden/cell states are not needed for tagging.
        lstm_out, _ = self.lstm(embedded)
        return self.fc(self.dropout(lstm_out))


HYPERPARAMETERS

In [59]:
# One embedding row per entry of the TEXT vocabulary.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
# One output score per entry of the TAGS vocabulary (this includes the padding tag).
OUTPUT_DIM = len(TAGS.vocab)
N_LAYERS = 4
BIDIRECTIONAL = True
DROPOUT = 0.3
# Vocabulary index of the TEXT padding token, handed to the embedding layer.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Arguments are positional, in the order of WordSegmenter.__init__.
model = WordSegmenter(INPUT_DIM,
                        EMBEDDING_DIM,
                        HIDDEN_DIM,
                        OUTPUT_DIM,
                        N_LAYERS,
                        BIDIRECTIONAL,
                        DROPOUT,
                        PAD_IDX)

Since I'm not using pretrained weights this time, initialize all of the model's parameters (the embedding weights included) from a Gaussian distribution with mean 0 and standard deviation 0.1.

In [60]:
def init_weights(m):
    """Redraw every parameter of ``m`` in place from a normal distribution (mean 0, std 0.1)."""
    for weight in m.parameters():
        nn.init.normal_(weight.data, mean = 0, std = 0.1)

# `apply` calls init_weights on every submodule and then on the model itself, and
# init_weights walks all parameters below its argument, so each parameter is
# redrawn more than once; the values from the last draw are the ones that stay.
model.apply(init_weights)

WordSegmenter(
  (embedding): Embedding(5454, 100, padding_idx=1)
  (lstm): LSTM(100, 128, num_layers=4, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

Print trainable parameters to judge size of the model. It's fairly large, which explains the GPU requirement.

In [61]:
def count_parameters(model):
    """Return the total number of elements across the parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print("{} trainable parameters".format(count_parameters(model)))

1967483 trainable parameters


Set weights for padding to zero to ignore their effect.

In [62]:
# init_weights redraws every parameter, the padding row of the embedding table
# included, so that row is reset to zeros here.
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Show the embedding table; the row at PAD_IDX should now be all zeros.
print(model.embedding.weight.data)


tensor([[-5.7248e-02,  2.0565e-01,  5.2836e-02,  ...,  4.8831e-02,
         -4.9211e-02,  1.1771e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 9.4289e-02, -5.3422e-02, -1.1588e-01,  ...,  1.3952e-02,
          1.6497e-01,  6.2637e-02],
        ...,
        [ 1.6150e-02,  7.4582e-02,  1.4738e-02,  ..., -1.5252e-01,
         -7.4020e-02,  1.4197e-01],
        [ 7.3277e-02,  3.1269e-02,  1.7995e-02,  ...,  1.6358e-01,
          2.2453e-02,  1.3270e-01],
        [-8.1434e-02,  7.5084e-02, -1.2482e-01,  ..., -1.0065e-04,
          2.5365e-01,  8.0984e-02]])


Standard Adam optimizer. No learning rate is passed, so the library default is used; Adam adapts the per-parameter step sizes on its own.

In [63]:
# Only the parameters are passed, so every Adam setting keeps its default value.
optimizer = torch.optim.Adam(model.parameters())

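`CrossEntropyLoss`, ignoring any outputs at padded positions. This is needed because every character gets its own prediction (not just the sentence as a whole), so padded positions would otherwise count towards the loss.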

In [64]:
# Vocabulary index of the padding tag.
TAG_PAD_IDX = TAGS.vocab.stoi[TAGS.pad_token]

# Targets equal to TAG_PAD_IDX are skipped when the loss is computed.
criterion = nn.CrossEntropyLoss(ignore_index = TAG_PAD_IDX)

Send the model and loss to the GPU if one is available.

In [65]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion.to(device)

CrossEntropyLoss()

Determine accuracy. This was pretty much a copy and paste from [this repo](https://github.com/bentrevett/pytorch-pos-tagging).

In [66]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """Return the share of non-padding positions whose top-scoring class equals the target.

    Args:
        preds: 2-D score tensor with one row per position and one column per class.
        y: 1-D tensor of target class indices, one per row of ``preds``.
        tag_pad_idx: Target value that marks padding; those positions are left out.

    Returns:
        A float tensor holding a single element (NaN when every position is padding).
    """
    predicted = preds.argmax(dim = 1)
    keep = y != tag_pad_idx
    n_correct = (predicted[keep] == y[keep]).sum()
    n_kept = int(keep.sum())
    return n_correct / torch.FloatTensor([n_kept])

In [None]:
def precision(preds, y, tag_pad_idx):
    """Unfinished stub for a precision metric.

    Only the top-scoring class per row and the indices of the non-padding
    targets are computed so far; nothing is returned yet (the call yields None).
    TODO: finish the computation before using this function.
    """
    max_preds = preds.argmax(dim = 1, keepdim = True)
    non_pad_elements = (y != tag_pad_idx).nonzero()
    

Standard train and eval functions.

In [67]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module that maps a batch of token indices to per-position scores.
        iterator: Iterable of batches exposing ``text`` and ``tags``; must support ``len``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called with flattened scores and flattened tags.
        tag_pad_idx: Tag index that marks padding, excluded from the accuracy.

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    model.train()

    loss_sum = 0
    acc_sum = 0

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.text.to(device))

        # Collapse all leading dimensions so the loss receives one row of class
        # scores per position and one target per position.
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.tags.view(-1)

        batch_loss = criterion(flat_scores, flat_tags.to(device))

        # The accuracy helper is fed CPU tensors.
        batch_acc = categorical_accuracy(flat_scores.cpu(), flat_tags.cpu(), tag_pad_idx)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [68]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Args:
        model: Module that maps a batch of token indices to per-position scores.
        iterator: Iterable of batches exposing ``text`` and ``tags``; must support ``len``.
        criterion: Loss called with flattened scores and flattened tags.
        tag_pad_idx: Tag index that marks padding, excluded from the accuracy.

    Returns:
        (mean loss per batch, mean accuracy per batch).
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    # Gradients are not needed while evaluating.
    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text.to(device))

            # One row of class scores and one target per position.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.tags.view(-1)

            batch_loss = criterion(flat_scores, flat_tags.to(device))
            batch_acc = categorical_accuracy(flat_scores.cpu(), flat_tags.cpu(), tag_pad_idx)

            loss_sum += batch_loss.item()
            acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

Train for up to 100 epochs (`N_EPOCHS`), saving the model to `model.pt` whenever the validation loss improves.

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 100

# Lowest validation loss seen so far; starts at infinity so the first epoch always saves.
best_val_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, TAG_PAD_IDX)
    val_loss, val_acc = evaluate(model, val_iterator, criterion, TAG_PAD_IDX)

    # Keep only the weights of the epoch with the best validation loss.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'model.pt')

    # Epochs are reported 1-based.
    print("Epoch: {}".format(epoch+1))
    print(f"Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.3f}")
    print(f"Val Loss: {val_loss:.3f} | Val Acc: {val_acc:.3f}")

Epoch: 1
Train Loss: 0.568 | Train Acc: 0.699
Val Loss: 0.299 | Val Acc: 0.874
Epoch: 2
Train Loss: 0.250 | Train Acc: 0.898
Val Loss: 0.202 | Val Acc: 0.920
Epoch: 3
Train Loss: 0.200 | Train Acc: 0.921
Val Loss: 0.173 | Val Acc: 0.934
Epoch: 4
Train Loss: 0.175 | Train Acc: 0.932
Val Loss: 0.151 | Val Acc: 0.942
Epoch: 5
Train Loss: 0.159 | Train Acc: 0.939
Val Loss: 0.141 | Val Acc: 0.947
Epoch: 6
Train Loss: 0.147 | Train Acc: 0.944
Val Loss: 0.131 | Val Acc: 0.951
Epoch: 7
Train Loss: 0.138 | Train Acc: 0.948
Val Loss: 0.123 | Val Acc: 0.955
Epoch: 8
Train Loss: 0.130 | Train Acc: 0.951
Val Loss: 0.116 | Val Acc: 0.957
Epoch: 9
Train Loss: 0.123 | Train Acc: 0.954
Val Loss: 0.112 | Val Acc: 0.960
Epoch: 10
Train Loss: 0.118 | Train Acc: 0.956
Val Loss: 0.106 | Val Acc: 0.961
Epoch: 11
Train Loss: 0.112 | Train Acc: 0.958
Val Loss: 0.103 | Val Acc: 0.963
Epoch: 12
Train Loss: 0.108 | Train Acc: 0.960
Val Loss: 0.098 | Val Acc: 0.964
Epoch: 13
Train Loss: 0.104 | Train Acc: 0.962
Va

In [None]:
# `test_data` is only bound when a test file was loaded.
if "test_data" in globals():
    # Restore the weights saved at the epoch with the lowest validation loss;
    # map_location places them on whichever device this session is using.
    model.load_state_dict(torch.load('model.pt', map_location=device))

    # Unpack into `test_acc` so the `test_data` dataset is not overwritten.
    test_loss, test_acc = evaluate(model, test_iterator, criterion, TAG_PAD_IDX)

    print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.3f}")