TCN原论文：An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling，https://arxiv.org/abs/1803.01271
官方实现地址：https://github.com/locuslab/TCN

In [2]:
!git clone https://github.com/HoratioJSY/tcn.git

fatal: destination path 'tcn' already exists and is not an empty directory.


In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
import os
import torch
from torch import nn
from torch.autograd import Variable
import pickle
from torch.nn.utils import weight_norm
import argparse
import time
import math
import torch.optim as optim

In [0]:
# utils.py，关于数据与词汇库等预定义的类与方法

"""
Note: The meaning of batch_size in PTB is different from that in MNIST example. In MNIST,
batch_size is the # of sample data that is considered in each iteration; in PTB, however,
it is the number of segments to speed up computation. 
The goal of PTB is to train a language model to predict the next word.
"""

def data_generator(data_path):
    corpus = Corpus(data_path)
    pickle.dump(corpus, open(data_path + '/corpus', 'wb'))
    return corpus

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

def batchify(data, batch_size, cuda):
    """The output should have size [L x batch_size], where L could be a long sequence length"""
    # Work out how cleanly we can divide the dataset into batch_size parts (i.e. continuous seqs).
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size batches.
    data = data.view(batch_size, -1)
    if cuda:
        data = data.cuda()
    return data


def get_batch(source, i, seq_len, seq_le=None, evaluation=False):
    seq_le = min(seq_le if seq_le else seq_len, source.size(1) - 1 - i)
    data = Variable(source[:, i:i+seq_le], volatile=evaluation)
    target = Variable(source[:, i+1:i+1+seq_le])     # CAUTION: This is un-flattened!
    return data, target

In [0]:
# tcn.py

# 定义实现因果卷积的类（继承自类nn.Module），其中super(Chomp1d, self).__init__()表示对继承自父类的属性进行初始化。
class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    # 通过增加Padding的方式并对卷积后的张量做切片而实现因果卷积
    # tensor.contiguous()会返回有连续内存的相同张量
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


# 定义残差块，即两个一维卷积与恒等映射
class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()

        # 定义第一个空洞卷积层
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        # 根据第一个卷积层的输出与padding大小实现因果卷积
        self.chomp1 = Chomp1d(padding)
        # 添加激活函数与dropout正则化方法完成第一个卷积
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout2d(dropout)

        # 堆叠同样结构的第二个卷积层
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout2d(dropout)

        # 将卷积模块的所有组建通过Sequential方法依次堆叠在一起
        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)

        # padding保证了输入序列与输出序列的长度相等，但卷积前的通道数与卷积后的通道数不一定一样。
        # 如果通道数不一样，那么需要对输入x做一个逐元素的一维卷积以使得它的纬度与前面两个卷积相等。
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weights()

    # 初始化为从均值为0，标准差为0.01的正态分布中采样的随机值
    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    # 结合卷积与输入的恒等映射（或输入的逐元素卷积），并投入ReLU 激活函数完成残差模块
    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


# 定义时间卷积网络的架构
class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []

        # num_channels为各层卷积运算的输出通道数或卷积核数量，它的长度即需要执行的卷积层数量
        num_levels = len(num_channels)
        # 空洞卷积的扩张系数若随着网络层级的增加而成指数级增加，则可以增大感受野并不丢弃任何输入序列的元素
        # dilation_size根据层级数成指数增加，并从num_channels中抽取每一个残差模块的输入通道数与输出通道数
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i - 1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size - 1) * dilation_size, dropout=dropout)]
        # 将所有残差模块堆叠起来组成一个深度卷积网络
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)



In [0]:
# model.py

class TCN(nn.Module):
    def __init__(self, input_size, output_size, num_channels,
                 kernel_size=2, dropout=0.3, emb_dropout=0.1, tied_weights=False):
        super(TCN, self).__init__()

        # 将一个批量的输入数据（one-hot encoding）送入编码器中成为一个批量的词嵌入向量
        # 其中output_size为词汇量，input_size为一个词向量的长度
        self.encoder = nn.Embedding(output_size, input_size)

        # 构建网络
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout)

        # 定义最后线性变换的纬度，即最后一个卷积层的通道数（类似2D卷积中的特征图数）到所有词汇的映射
        self.decoder = nn.Linear(num_channels[-1], output_size)

        # 是否共享编码器与解码器的权重，默认是共享。共享的话需要保持隐藏单元数等于词嵌入长度，这样预测的向量才可以视为词嵌入向量
        if tied_weights:
            if num_channels[-1] != input_size:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
            print("Weight tied")

        # 对输入词嵌入执行Dropout 表示随机从句子中舍弃词，迫使模型不依赖于单个词完成任务
        self.drop = nn.Dropout(emb_dropout)
        self.emb_dropout = emb_dropout
        self.init_weights()

    def init_weights(self):
        self.encoder.weight.data.normal_(0, 0.01)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.normal_(0, 0.01)

    # 先编码，训练中再随机丢弃词，输入到网络实现推断，最后将推断结果解码为词
    def forward(self, input):
        """Input ought to have dimension (N, C_in, L_in), where L_in is the seq_len; here the input is (N, L, C)"""
        emb = self.drop(self.encoder(input))
        y = self.tcn(emb.transpose(1, 2)).transpose(1, 2)
        y = self.decoder(y)
        return y.contiguous()

In [8]:
#word_cnn_test.py

cuda = True # 是否使用 CUDA
data_path = './tcn/data/penn' # 数据集地址
batch_size = 16 # 批量大小
nhid = 600 # 每层隐藏单元数
levels = 4 # 残差模块数
emsize = 600 # 词嵌入长度
k_size = 3 # 卷积核大小
dropout = 0.45 # 应用到网络层级中的随机失活率
emb_dropout = 0.25 # 应用到嵌入层的随机失活率
tied = True # 是否绑定编码器与解码器的权重
lr = 4 # 初始学习率
optimization ='SGD'
validseqlen = 40 # 验证序列长度
seq_len = 80 # 总序列长度
log_interval = 100 # 记录日志的间隔
clip = 0.35 # 梯度截断，-1表示不采用梯度截断
epochs =100 # 训练轮数的上限

# Set the random seed manually for reproducibility.
torch.manual_seed(1111)
if torch.cuda.is_available():
    if not cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

corpus = data_generator(data_path)
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size, cuda)
val_data = batchify(corpus.valid, eval_batch_size, cuda)
test_data = batchify(corpus.test, eval_batch_size, cuda)


n_words = len(corpus.dictionary)
num_chans = [nhid] * (levels - 1) + [emsize]

model = TCN(emsize, n_words, num_chans, dropout=dropout, emb_dropout=emb_dropout, kernel_size=k_size, tied_weights=tied)

if cuda:
    model.cuda()

# May use adaptive softmax to speed up training
criterion = nn.CrossEntropyLoss()


optimizer = getattr(optim, optimization)(model.parameters(), lr=lr)


def evaluate(data_source):
    model.eval()
    total_loss = 0
    processed_data_size = 0
    for i in range(0, data_source.size(1) - 1, validseqlen):
        if i + seq_len - validseqlen >= data_source.size(1) - 1:
            continue
        data, targets = get_batch(data_source, i, seq_len, evaluation=True)
        output = model(data)

        # Discard the effective history, just like in training
        eff_history = seq_len - validseqlen
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        final_target = targets[:, eff_history:].contiguous().view(-1)

        loss = criterion(final_output, final_target)

        # Note that we don't add TAR loss here
        total_loss += (data.size(1) - eff_history) * loss.data
        processed_data_size += data.size(1) - eff_history
    return total_loss[0] / processed_data_size


def train():
    # Turn on training mode which enables dropout.
    global train_data
    model.train()
    total_loss = 0
    start_time = time.time()
    for batch_idx, i in enumerate(range(0, train_data.size(1) - 1, validseqlen)):
        if i + seq_len - validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, seq_len)
        optimizer.zero_grad()
        output = model(data)

        # Discard the effective history part
        eff_history = seq_len - validseqlen
        if eff_history < 0:
            raise ValueError("Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)

        loss.backward()
        if clip > 0:
            torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()

        total_loss += loss.data

        if batch_idx % log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss[0] / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch_idx, train_data.size(1) // validseqlen, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


if __name__ == "__main__":
    best_vloss = 1e8

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        all_vloss = []
        for epoch in range(1, epochs+1):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_data)
            test_loss = evaluate(test_data)

            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                               val_loss, math.exp(val_loss)))
            print('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | '
                  'test ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                            test_loss, math.exp(test_loss)))
            print('-' * 89)

            # Save the model if the validation loss is the best we've seen so far.
            if val_loss < best_vloss:
                with open("model.pt", 'wb') as f:
                    print('Save model!\n')
                    torch.save(model, f)
                best_vloss = val_loss

            # Anneal the learning rate if the validation loss plateaus
            if epoch > 5 and val_loss >= max(all_vloss[-5:]):
                lr = lr / 2.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
            all_vloss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    with open("model.pt", 'rb') as f:
        model = torch.load(f)

    # Run on test data.
    test_loss = evaluate(test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)

Weight tied
| epoch   1 |   100/ 1452 batches | lr 4.00000 | ms/batch 118.81662 | loss  7.52 | ppl  1840.81
| epoch   1 |   200/ 1452 batches | lr 4.00000 | ms/batch 109.73478 | loss  6.81 | ppl   910.24
| epoch   1 |   300/ 1452 batches | lr 4.00000 | ms/batch 109.62409 | loss  6.58 | ppl   717.81
| epoch   1 |   400/ 1452 batches | lr 4.00000 | ms/batch 109.41673 | loss  6.37 | ppl   582.09
| epoch   1 |   500/ 1452 batches | lr 4.00000 | ms/batch 109.33626 | loss  6.23 | ppl   506.72
| epoch   1 |   600/ 1452 batches | lr 4.00000 | ms/batch 109.01555 | loss  6.21 | ppl   498.33
| epoch   1 |   700/ 1452 batches | lr 4.00000 | ms/batch 108.83427 | loss  6.12 | ppl   452.68
| epoch   1 |   800/ 1452 batches | lr 4.00000 | ms/batch 108.99423 | loss  6.01 | ppl   409.11
| epoch   1 |   900/ 1452 batches | lr 4.00000 | ms/batch 108.66103 | loss  5.98 | ppl   396.50
| epoch   1 |  1000/ 1452 batches | lr 4.00000 | ms/batch 108.71295 | loss  5.93 | ppl   376.19
| epoch   1 |  1100/ 1452 ba

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   100/ 1452 batches | lr 4.00000 | ms/batch 109.80678 | loss  5.82 | ppl   336.42
| epoch   2 |   200/ 1452 batches | lr 4.00000 | ms/batch 108.55440 | loss  5.70 | ppl   298.02
| epoch   2 |   300/ 1452 batches | lr 4.00000 | ms/batch 108.44461 | loss  5.67 | ppl   290.36
| epoch   2 |   400/ 1452 batches | lr 4.00000 | ms/batch 108.52798 | loss  5.55 | ppl   258.41
| epoch   2 |   500/ 1452 batches | lr 4.00000 | ms/batch 108.56836 | loss  5.53 | ppl   251.28
| epoch   2 |   600/ 1452 batches | lr 4.00000 | ms/batch 108.47103 | loss  5.58 | ppl   265.04
| epoch   2 |   700/ 1452 batches | lr 4.00000 | ms/batch 108.35539 | loss  5.55 | ppl   257.78
| epoch   2 |   800/ 1452 batches | lr 4.00000 | ms/batch 108.45743 | loss  5.51 | ppl   246.27
| epoch   2 |   900/ 1452 batches | lr 4.00000 | ms/batch 108.37481 | loss  5.51 | ppl   246.76
| epoch   2 |  1000/ 1452 batches | lr 4.00000 | ms/batch 108.62212 | loss  5.48 | ppl   240.42
| epoch   2 |  1100/ 1452 batches | lr 4