# Character-level Language Modeling

## Overview

In character-level language modeling tasks, each sequence is split into individual characters, so at each time step the model is expected to predict the next character. We evaluate the temporal convolutional network as a character-level language model on the PennTreebank dataset.

## Settings

In [12]:
import torch as th
import torch.nn as nn
import observations
import unidecode
from collections import Counter
import time
import math
from tqdm.notebook import tqdm
import torch.nn.functional as F

# --- Reproducibility and runtime environment --------------------------------
SEED = 1111                            # seed for torch's default RNG (applied below)
DEVICE = "cuda:0"                      # device that data tensors and the model are moved to
DATA_ROOT = "/home/densechen/dataset"  # directory handed to observations.ptb

# --- Batching and windowing --------------------------------------------------
BATCH_SIZE = 32      # number of parallel rows the training text is folded into
SEQ_LEN = 400        # maximum number of time steps fed to the model per window
VALID_SEQ_LEN = 320  # stride between consecutive windows; the first
                     # SEQ_LEN - VALID_SEQ_LEN steps of each window are not scored

# --- Model -------------------------------------------------------------------
EMSIZE = 100       # embedding width (input_size of the TCN wrapper)
NHID = 450         # channel count used for every entry but the last
LEVELS = 3         # number of entries in CHANNEL_SIZES
KSIZE = 3          # kernel_size passed to the model
DROPOUT = 0.1      # dropout passed through to TemporalConvNet
EMB_DROPOUT = 0.1  # dropout applied to the embeddings

# The last entry is EMSIZE because the decoder is built with EMSIZE input features.
CHANNEL_SIZES = [NHID] * (LEVELS - 1) + [EMSIZE]

# --- Optimization ------------------------------------------------------------
OPTIM = "SGD"  # name of the optimizer class looked up on torch.optim
LR = 4         # learning rate given to the optimizer
CLIP = 0.15    # max gradient norm; clipping is skipped when this is <= 0
EPOCHS = 10    # number of passes over the training data

th.manual_seed(SEED)

<torch._C.Generator at 0x7f4e4c0479d0>

## Data Generation

### PennTreebank

When used as a character-level language corpus, PTB contains 5,059K characters for training, 396K for validation and 446K for testing, with an alphabet size of 50. PennTreebank is a well-studied (but relatively small) language dataset.

In [7]:
class Dictionary(object):
    """Two-way mapping between characters and integer indices.

    Characters are first tallied with `add_word`; `prep_dict` then assigns an
    index to each distinct character in the order `counter` iterates them.
    """

    def __init__(self):
        self.char2idx = {}        # character -> index
        self.idx2char = []        # index -> character
        self.counter = Counter()  # character -> number of occurrences seen

    def add_word(self, char):
        """Record one occurrence of `char`; no index is assigned here."""
        self.counter[char] += 1

    def prep_dict(self):
        """Give every counted character that has no index the next free one."""
        for char in self.counter:
            if char in self.char2idx:
                continue
            # The next free index is the current vocabulary size.
            self.char2idx[char] = len(self.idx2char)
            self.idx2char.append(char)

    def __len__(self):
        """Return the number of characters that have been assigned an index."""
        return len(self.idx2char)

class Corpus(object):
    """Character vocabulary built from one string.

    After construction, `self.dict` is a `Dictionary` in which every character
    of `string` has been counted and assigned an index.
    """

    def __init__(self, string):
        vocab = Dictionary()
        for char in string:
            vocab.add_word(char)
        # Indices are assigned only after all characters have been counted.
        vocab.prep_dict()
        self.dict = vocab

def date_generator():
    """Load the PTB splits through `observations.ptb` and build the vocabulary.

    Returns:
        A 7-tuple ``(train_text, train_len, valid_text, valid_len, test_text,
        test_len, corpus)`` where each ``*_len`` is ``len()`` of the matching
        text and ``corpus`` is a `Corpus` built from the three texts joined by
        single spaces (train, validation, test order).
    """
    # The loader's result is unpacked in the order (train, test, validation).
    train_text, test_text, valid_text = observations.ptb(DATA_ROOT)

    # One vocabulary shared by all three splits.
    corpus = Corpus(" ".join((train_text, valid_text, test_text)))

    return (
        train_text, len(train_text),
        valid_text, len(valid_text),
        test_text, len(test_text),
        corpus,
    )

def char_tensor(corpus, string):
    """Encode `string` as a 1-D tensor of vocabulary indices on DEVICE.

    Args:
        corpus: object whose ``dict.char2idx`` maps characters to integer indices.
        string: text to encode.

    Returns:
        A ``torch.long`` tensor of shape ``(len(string),)`` moved to DEVICE.

    Raises:
        KeyError: if `string` contains a character missing from ``char2idx``.
    """
    char2idx = corpus.dict.char2idx
    # Collect the indices in a Python list and convert once. Writing tensor
    # elements one at a time pays tensor-indexing overhead for every character.
    indices = [char2idx[ch] for ch in string]
    return th.tensor(indices, dtype=th.long).to(DEVICE)

def batchify(data, batch_size):
    """Fold `data` into `batch_size` contiguous rows and move it to DEVICE.

    The result has shape ``(batch_size, L)`` with ``L = len(data) // batch_size``.
    Row ``k`` holds elements ``[k * L, (k + 1) * L)`` of `data`, so each row is
    one continuous stretch of the original sequence. Trailing elements that do
    not fit evenly are dropped.
    """
    row_len = len(data) // batch_size
    # Keep only the prefix that divides evenly into batch_size rows.
    usable = data[: row_len * batch_size]
    return usable.view(batch_size, -1).to(DEVICE)

def get_batch(source, start_index):
    """Cut one (input, target) window out of `source` along dimension 1.

    The window starts at `start_index` and spans at most SEQ_LEN columns; it is
    shortened near the end so the target, which is the input shifted one column
    to the right, still fits inside `source`.

    Returns:
        ``(inp, target)``: two contiguous tensors of identical shape.
    """
    # The final column can only serve as a target, never as an input.
    steps_left = source.size(1) - 1 - start_index
    window = min(SEQ_LEN, steps_left)
    stop = start_index + window

    inp = source[:, start_index:stop].contiguous()
    target = source[:, start_index + 1:stop + 1].contiguous()
    return inp, target

print("Producing data...")
(file, file_len,
 valfile, valfile_len,
 testfile, testfile_len,
 corpus) = date_generator()

n_characters = len(corpus.dict)

# The training text is folded into BATCH_SIZE parallel rows; the validation
# and test texts are each kept as a single row.
train_data, val_data, test_data = (
    batchify(char_tensor(corpus, text), rows)
    for text, rows in ((file, BATCH_SIZE), (valfile, 1), (testfile, 1))
)

print(f"Corpus size: {n_characters}")
print("Finished.")

Producing data...
Corpus size: 49
Finished.


## Build Model

In [9]:
from core.tcn import TemporalConvNet
class TCN(nn.Module):
    """Embedding -> TemporalConvNet -> linear decoder, with tied embedding/decoder weights.

    Args:
        input_size: embedding width; also the number of input features the
            decoder is built with.
        output_size: number of embedding rows and of decoder outputs.
        num_channels: forwarded to TemporalConvNet.
            NOTE(review): the decoder expects `input_size` features, so the
            network's output width presumably has to equal `input_size` --
            confirm against core.tcn.
        kernel_size: forwarded to TemporalConvNet.
        dropout: forwarded to TemporalConvNet.
        emb_dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, output_size, num_channels, kernel_size=2, dropout=0.2, emb_dropout=0.2):
        super().__init__()
        # Sub-modules are created in a fixed order (encoder, tcn, decoder,
        # drop) so parameter initialisation draws from the RNG in the same sequence.
        self.encoder = nn.Embedding(output_size, input_size)
        self.tcn = TemporalConvNet(
            input_size,
            num_channels,
            kernel_size=kernel_size,
            dropout=dropout,
        )
        self.decoder = nn.Linear(input_size, output_size)
        # Weight tying: both weights have shape (output_size, input_size).
        self.decoder.weight = self.encoder.weight
        self.drop = nn.Dropout(emb_dropout)

    def forward(self, x):
        """Map indices of shape (N, L_in) to per-step outputs over `output_size` classes."""
        embedded = self.encoder(x)        # (N, L_in) -> (N, L_in, C_in)
        embedded = self.drop(embedded)
        # Dimensions 1 and 2 are swapped going into the network and back coming out.
        features = self.tcn(embedded.transpose(1, 2))
        logits = self.decoder(features.transpose(1, 2))
        return logits.contiguous()

print("Building model...")

model = TCN(
    input_size=EMSIZE,
    output_size=n_characters,
    num_channels=CHANNEL_SIZES,
    kernel_size=KSIZE,
    dropout=DROPOUT,
    emb_dropout=EMB_DROPOUT,
).to(DEVICE)

# The optimizer class is looked up by name on torch.optim.
optimizer = getattr(th.optim, OPTIM)(model.parameters(), lr=LR)
print("Finished.")

Building model...
Finished.


## Run

In [16]:
def evaluate(source):
    """Return the mean cross-entropy of `model` over `source`.

    `source` is walked in windows that start every VALID_SEQ_LEN columns. In
    each window the first ``SEQ_LEN - VALID_SEQ_LEN`` steps are fed to the
    model but excluded from the loss; only the remaining steps are scored.

    Args:
        source: 2-D tensor of character indices, indexed as ``[row, column]``.

    Returns:
        float: loss averaged over every scored position.

    Raises:
        ValueError: if `source` is too short to contain any scored position.
    """
    model.eval()
    eff_history = SEQ_LEN - VALID_SEQ_LEN
    source_len = source.size(1)
    total_loss = 0.0
    count = 0
    with th.no_grad():
        for i in range(0, source_len - 1, VALID_SEQ_LEN):
            # get_batch yields at most source_len - 1 - i steps. If that does
            # not exceed eff_history, nothing would be scored, and
            # cross_entropy over zero elements is NaN, which would poison the
            # running total. Skip such windows.
            if i + eff_history >= source_len - 1:
                continue
            inp, target = get_batch(source, i)
            output = model(inp)
            final_output = output[:, eff_history:].contiguous().view(-1, n_characters)
            final_target = target[:, eff_history:].contiguous().view(-1)
            loss = F.cross_entropy(final_output, final_target)

            # cross_entropy returns the window mean; weight it by the number of
            # scored positions so the final result is a per-position average.
            n_scored = final_output.size(0)
            total_loss += loss.item() * n_scored
            count += n_scored

    if count == 0:
        raise ValueError(
            "evaluate: source is too short to contain any scored position"
        )
    return total_loss / count


def train(ep):
    """Run one optimisation pass over `train_data`.

    The data is walked in windows that start every VALID_SEQ_LEN columns. In
    each window the first ``SEQ_LEN - VALID_SEQ_LEN`` steps are fed to the
    model but excluded from the loss. Gradients are clipped to norm CLIP when
    CLIP > 0.

    Args:
        ep: epoch number; used only in the progress-bar description.
    """
    model.train()
    eff_history = SEQ_LEN - VALID_SEQ_LEN
    source = train_data
    source_len = source.size(1)
    process = tqdm(range(0, source_len - 1, VALID_SEQ_LEN))
    for i in process:
        # get_batch yields at most source_len - 1 - i steps. If that does not
        # exceed eff_history, nothing would be scored and the loss would be
        # NaN, so skip the window.
        if i + eff_history >= source_len - 1:
            continue
        inp, target = get_batch(source, i)
        optimizer.zero_grad()
        output = model(inp)
        final_output = output[:, eff_history:].contiguous().view(-1, n_characters)
        final_target = target[:, eff_history:].contiguous().view(-1)
        loss = F.cross_entropy(final_output, final_target)
        loss.backward()

        if CLIP > 0:
            th.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        process.set_description(f"Train Epoch: {ep}, loss: {loss.item():.4f}")

# After every training epoch, report the loss on the validation and test data.
for epoch in range(1, EPOCHS + 1):
    train(epoch)

    vloss = evaluate(val_data)
    print(
        '-' * 89,
        f'| End of epoch {epoch:3d} | valid loss {vloss:5.3f}',
        sep='\n',
    )

    test_loss = evaluate(test_data)
    print(
        '=' * 89,
        f'| End of epoch {epoch:3d} | test loss {test_loss:5.3f}',
        '=' * 89,
        sep='\n',
    )

  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   1 | valid loss 1.399
| End of epoch   1 | test loss 1.370


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   2 | valid loss 1.245
| End of epoch   2 | test loss 1.215


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   3 | valid loss 1.179
| End of epoch   3 | test loss 1.149


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   4 | valid loss 1.140
| End of epoch   4 | test loss 1.110


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   5 | valid loss 1.115
| End of epoch   5 | test loss 1.084


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   6 | valid loss 1.091
| End of epoch   6 | test loss 1.061


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   7 | valid loss 1.074
| End of epoch   7 | test loss 1.044


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   8 | valid loss 1.064
| End of epoch   8 | test loss 1.033


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch   9 | valid loss 1.053
| End of epoch   9 | test loss 1.023


  0%|          | 0/515 [00:00<?, ?it/s]

-----------------------------------------------------------------------------------------
| End of epoch  10 | valid loss 1.043
| End of epoch  10 | test loss 1.014
