In [1]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import torch.optim as optim
import random
import time
import json
import copy

from load_data import load_dataset

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Model structure

In [2]:
### KimCNN Model Structure
class KimCNN(nn.Module):
    def __init__(self, mode, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super(KimCNN, self).__init__()
        output_channel = n_filters
        target_class = output_dim
        words_num = vocab_size
        words_dim = embedding_dim
        embed_num = vocab_size
        embed_dim = embedding_dim
        self.mode = mode
        Ks = 3 # There are three conv net here
        if self.mode == 'multichannel':
            input_channel = 2
        else:
            input_channel = 1
        self.embed = nn.Embedding(words_num, words_dim)
        self.static_embed = nn.Embedding(embed_num, embed_dim)
        self.non_static_embed = nn.Embedding(embed_num, embed_dim)
        self.static_embed.weight.requires_grad = False

        self.conv1 = nn.Conv2d(input_channel, output_channel, (3, words_dim), padding=(2,0))
        self.conv2 = nn.Conv2d(input_channel, output_channel, (4, words_dim), padding=(3,0))
        self.conv3 = nn.Conv2d(input_channel, output_channel, (5, words_dim), padding=(4,0))

        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(Ks * output_channel, target_class)


    def forward(self, x):
        if self.mode == 'rand':
            word_input = self.embed(x) # (batch, sent_len, embed_dim)
            x = word_input.unsqueeze(1) # (batch, channel_input, sent_len, embed_dim)
        elif self.mode == 'static':
            static_input = self.static_embed(x)
            x = static_input.unsqueeze(1) # (batch, channel_input, sent_len, embed_dim)
        elif self.mode == 'non-static':
            non_static_input = self.non_static_embed(x)
            x = non_static_input.unsqueeze(1) # (batch, channel_input, sent_len, embed_dim)
        elif self.mode == 'multichannel':
            non_static_input = self.non_static_embed(x)
            static_input = self.static_embed(x)
            x = torch.stack([non_static_input, static_input], dim=1) # (batch, channel_input=2, sent_len, embed_dim)
        else:
            print("Unsupported Mode")
            exit()
        x = [F.relu(self.conv1(x)).squeeze(3), F.relu(self.conv2(x)).squeeze(3), F.relu(self.conv3(x)).squeeze(3)]
        # (batch, channel_output, ~=sent_len) * Ks
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # max-over-time pooling
        # (batch, channel_output) * Ks
        x = torch.cat(x, 1) # (batch, channel_output * Ks)
        x = self.dropout(x)
        logit = self.fc1(x) # (batch, target_size)
        return logit

### Utilities

In [3]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label.long())
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label.long())
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def multi_models(model, train_iterator, valid_iterator,  test_iterator, num = 5, N_EPOCHS = 20, early_stopping = 5):
    init_model = copy.deepcopy(model)
    res = {}
    res["test_loss"] = []
    res["test_acc"] = []
    ### begin training process
    for i in range(num):
        print('Begin training model %s'%i)
        res["model_%s"%i] = {}
        res["model_%s"%i]["time"] = []
        res["model_%s"%i]["train_loss"] = []
        res["model_%s"%i]["val_loss"] = []
        res["model_%s"%i]["train_acc"] = []
        res["model_%s"%i]["val_acc"] = []
        best_valid_loss = float('inf')
        num_steps = 0
        ### reset model for each round
        # model.load_state_dict(init_param)
        model = copy.deepcopy(init_model)
        model = model.to(device)
        optimizer = optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)
        
        for epoch in range(N_EPOCHS):
            print('Begin epoch %s'%epoch)
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
            end_time = time.time()
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), 'best-model.pt')
                num_steps = 0
            else:
                num_steps += 1
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
            res["model_%s"%i]["time"].append(end_time - start_time)
            res["model_%s"%i]["train_loss"].append(train_loss)
            res["model_%s"%i]["val_loss"].append(valid_loss)
            res["model_%s"%i]["train_acc"].append(train_acc)
            res["model_%s"%i]["val_acc"].append(valid_acc)
            if num_steps >= early_stopping:
                break
        
        model.load_state_dict(torch.load('best-model.pt'))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
        res["test_loss"].append(test_loss)
        res["test_acc"].append(test_acc)
    return res

### Train model on IMDB dataset

In [5]:
data_name = "IMDB"
train_iterator, valid_iterator, test_iterator, TEXT, LABEL = load_dataset(data_name, device)

In [9]:
### Statis Model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
MODE = "static"

model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_static_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "non-static"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_nonstatic_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "multichannel"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_multichannel_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

### Train model on TREC dataset

In [12]:
data_name = "TREC"
train_iterator, valid_iterator, test_iterator, TEXT, LABEL = load_dataset(data_name, device)

### Statis Model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
MODE = "static"

model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_static_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "non-static"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_nonstatic_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "multichannel"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_multichannel_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

downloading train_5500.label


train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 5.99MB/s]


downloading TREC_10.label


TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 1.99MB/s]


Begin training model 0
Begin epoch 0
	Train Loss: 1.048 | Train Acc: 62.69%
	 Val. Loss: 0.595 |  Val. Acc: 81.81%
Begin epoch 1
	Train Loss: 0.514 | Train Acc: 82.71%
	 Val. Loss: 0.428 |  Val. Acc: 85.46%
Begin epoch 2
	Train Loss: 0.353 | Train Acc: 88.57%
	 Val. Loss: 0.369 |  Val. Acc: 87.19%
Begin epoch 3
	Train Loss: 0.260 | Train Acc: 91.65%
	 Val. Loss: 0.344 |  Val. Acc: 87.19%
Begin epoch 4
	Train Loss: 0.195 | Train Acc: 94.28%
	 Val. Loss: 0.353 |  Val. Acc: 87.37%
Begin epoch 5
	Train Loss: 0.147 | Train Acc: 95.95%
	 Val. Loss: 0.320 |  Val. Acc: 89.43%
Begin epoch 6
	Train Loss: 0.106 | Train Acc: 97.40%
	 Val. Loss: 0.339 |  Val. Acc: 88.06%
Begin epoch 7
	Train Loss: 0.087 | Train Acc: 97.90%
	 Val. Loss: 0.327 |  Val. Acc: 89.44%
Begin epoch 8
	Train Loss: 0.073 | Train Acc: 98.56%
	 Val. Loss: 0.329 |  Val. Acc: 88.58%
Begin epoch 9
	Train Loss: 0.061 | Train Acc: 98.57%
	 Val. Loss: 0.331 |  Val. Acc: 88.07%
Begin epoch 10
	Train Loss: 0.048 | Train Acc: 99.10%
	 V

### Train model on SST

In [13]:
data_name = "SST"
train_iterator, valid_iterator, test_iterator, TEXT, LABEL = load_dataset(data_name, device)

### Statis Model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
MODE = "static"

model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_static_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "non-static"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_nonstatic_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

MODE = "multichannel"
model = KimCNN(MODE, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
### pre-train embeddings
pretrained_embeddings = TEXT.vocab.vectors
model.static_embed.weight.data.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.static_embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.static_embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

res_kimcnn = multi_models(model, train_iterator, valid_iterator, test_iterator)
res_kimcnn["num_param"] = count_parameters(model)
with open("kimCNN_multichannel_%s.json"%data_name, "w") as outfile:
    outfile.write(json.dumps(res_kimcnn, indent=4))

downloading trainDevTestTrees_PTB.zip


trainDevTestTrees_PTB.zip: 100%|██████████| 790k/790k [00:00<00:00, 1.26MB/s]


extracting
Begin training model 0
Begin epoch 0
	Train Loss: 0.283 | Train Acc: 88.47%
	 Val. Loss: 0.408 |  Val. Acc: 81.14%
Begin epoch 1
	Train Loss: 0.215 | Train Acc: 91.76%
	 Val. Loss: 0.410 |  Val. Acc: 81.90%
Begin epoch 2
	Train Loss: 0.184 | Train Acc: 93.07%
	 Val. Loss: 0.476 |  Val. Acc: 81.34%
Begin epoch 3
	Train Loss: 0.162 | Train Acc: 94.00%
	 Val. Loss: 0.474 |  Val. Acc: 81.29%
Begin epoch 4
	Train Loss: 0.150 | Train Acc: 94.55%
	 Val. Loss: 0.485 |  Val. Acc: 81.72%
Begin epoch 5
	Train Loss: 0.140 | Train Acc: 95.03%
	 Val. Loss: 0.480 |  Val. Acc: 82.61%
Test Loss: 0.385 | Test Acc: 82.62%
Begin training model 1
Begin epoch 0
	Train Loss: 0.286 | Train Acc: 88.31%
	 Val. Loss: 0.407 |  Val. Acc: 81.85%
Begin epoch 1
	Train Loss: 0.214 | Train Acc: 91.73%
	 Val. Loss: 0.410 |  Val. Acc: 83.91%
Begin epoch 2
	Train Loss: 0.183 | Train Acc: 93.10%
	 Val. Loss: 0.453 |  Val. Acc: 80.78%
Begin epoch 3
	Train Loss: 0.162 | Train Acc: 94.07%
	 Val. Loss: 0.455 |  Val.