In [33]:
import torch 
import nltk
import pdb
import numpy as np
import torch.nn as nn 
import torch.optim as optim

from time import time
from sklearn import metrics
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Fetch the WordNet corpus; the WordNetLemmatizer imported above is used
# later for tokenization.
nltk.download('wordnet')
# Run on the first CUDA device when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# IPython magic: do not drop into pdb automatically when a cell raises.
%pdb off

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Automatic pdb calling has been turned OFF


## Global Arguments

In [0]:
class Argument():
    """Plain container for the hyper-parameters shared by the notebook cells.

    Every value is exposed as an instance attribute (``args.batch_size``,
    ``args.lr``, ...).
    """

    def __init__(self):
        defaults = dict(
            batch_size=256,            # samples per DataLoader batch
            embedding_size=300,        # dimensionality of the token embeddings
            hidden_size=512,           # width of the hidden linear layer
            n_class=16,                # number of output classes
            dropout=0.5,               # dropout probability in the classifier head
            filter_num=100,            # output channels of each convolution
            layer_num=1,               # convolution layers per CNN block
            lr=1e-3,                   # optimizer learning rate
            wd=1e-4,                   # optimizer weight decay
            n_epochs=10,               # number of training epochs
            iter_interval=300,         # iterations between training progress logs
            metric_list=['accuracy'],  # metrics reported by the evaluator
        )
        for name, value in defaults.items():
            setattr(self, name, value)


args = Argument()

## Download the dataset

In [1]:
%%sh
# Download and unpack the topic-classification corpus and the GloVe vectors.
# -nc (no-clobber) makes wget skip a file that already exists, so re-running
# this cell does not create numbered duplicates (topicclass-v1.tar.gz.1, ...).
wget -nc http://phontron.com/data/topicclass-v1.tar.gz
tar xvzf topicclass-v1.tar.gz topicclass
wget -nc http://nlp.stanford.edu/data/glove.42B.300d.zip
# -n: never overwrite an already extracted file (avoids an interactive prompt).
unzip -n glove.42B.300d.zip
wc -l glove.42B.300d.txt

glove.42B.300d.txt
glove.42B.300d.zip
sample_data
topicclass
topicclass-v1.tar.gz
topicclass-v1.tar.gz.1
topicclass-v1.tar.gz.2


## Data Preprocessing

In [0]:
class PretrainedEmbedding():
    """Pretrained word vectors read from a text file, restricted to a vocabulary.

    Each non-empty line of the embedding file is parsed as a token followed
    by ``embedding_size`` whitespace-separated numbers.

    Args:
        embedding_size: dimensionality of every embedding vector.
    """
    def __init__(self, embedding_size):
        self.embedding_dict = {}  # token -> list of floats
        self.embedding_size = embedding_size

    def load(self, embedding_file, existing_tokens):
        """Read the vectors of the tokens in ``existing_tokens`` from a file.

        Lines whose token is not in ``existing_tokens`` are skipped without
        parsing their numbers; blank lines are ignored.

        Args:
            embedding_file: path of the text embedding file.
            existing_tokens: iterable of tokens whose vectors should be kept.

        Raises:
            ValueError: if a kept line does not hold exactly
                ``embedding_size`` values, or a value is not a number.
            AssertionError: if a kept token appears twice in the file.
        """
        start_time = time()
        existing_tokens = set(existing_tokens)
        print("Loading embedding from file...")

        # Explicit UTF-8 so the parsed tokens do not depend on the platform's
        # default text encoding.
        with open(embedding_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                line_split = line.split()
                if not line_split:
                    # Blank line (e.g. trailing newline at the end of file).
                    continue
                token, vec = line_split[0], line_split[1:]

                if token not in existing_tokens:
                    continue

                # Fail here with a clear message instead of later, when the
                # vector is copied into the weight matrix.
                if len(vec) != self.embedding_size:
                    raise ValueError(
                        "line {} of {}: expected {} values for token {!r}, "
                        "found {}".format(line_no, embedding_file,
                                          self.embedding_size, token, len(vec)))

                assert token not in self.embedding_dict
                self.embedding_dict[token] = [float(x) for x in vec]

        print("Loading finished in {:.1f}s".format(time() - start_time))

    def get_embedding_words(self):
        """Return the list of tokens that have a loaded vector."""
        return list(self.embedding_dict.keys())

    def get_embedding_size(self):
        """Return the dimensionality of the embedding vectors."""
        return self.embedding_size

    def get_embedding_weight_matrix(self, tokens_ls):
        """Build a ``(len(tokens_ls), embedding_size)`` numpy weight matrix.

        Row ``i`` holds the loaded vector of ``tokens_ls[i]``; tokens without
        a loaded vector get a row drawn from a normal distribution with
        standard deviation 0.6.
        """
        weight_matrix = np.zeros((len(tokens_ls), self.embedding_size))
        for i, token in enumerate(tokens_ls):
            vec = self.embedding_dict.get(token)
            if vec is None:
                vec = np.random.normal(scale=0.6,
                                       size=(self.embedding_size, ))
            weight_matrix[i] = vec
        return weight_matrix

    def __len__(self):
        """Number of tokens with a loaded vector."""
        return len(self.embedding_dict)


class Vocabulary():
    """Token <-> index table that grows as tokens are added.

    Indices 0 and 1 are reserved for '<UNK>' and '<PAD>'; every new token
    receives the next free index.
    """
    def __init__(self):
        self.token_ls = ['<UNK>', '<PAD>']
        self.token2index = {tok: idx for idx, tok in enumerate(self.token_ls)}
        self.token_cnt = len(self.token_ls)

    def add_token(self, token):
        """Register ``token`` if it is new; known tokens are left untouched."""
        is_new = token not in self.token2index
        if is_new:
            self.token2index[token] = self.token_cnt
            self.token_ls.append(token)
            self.token_cnt += 1
        # The three views of the vocabulary must always stay in sync.
        assert self.token_cnt == len(self.token_ls) == len(self.token2index)

    def add_token_ls(self, token_ls):
        """Register every token of ``token_ls`` in order."""
        for tok in token_ls:
            self.add_token(tok)

    def get_token_ls(self):
        """Return the tokens ordered by index."""
        return self.token_ls

    def get_token2index(self):
        """Return the token -> index dictionary."""
        return self.token2index

    def get_vocab_size(self):
        """Return the number of tokens, reserved ones included."""
        return self.token_cnt

    def __str__(self):
        return "Vocabulary of {} tokens".format(self.get_vocab_size())

"""
The main class that will be used to process the data for topic classification.
"""
class DataProcessor():
    """Converts ``topic ||| text`` files into label ids and token-id lists.

    The vocabulary and the topic -> index mapping are built while processing
    in "train" mode and reused for every other mode.

    Args:
        lemmatizer: object with a ``lemmatize(token[, pos])`` method (the
            notebook passes ``WordNetLemmatizer()``), or ``None`` to keep the
            tokens unchanged.
    """
    def __init__(self, lemmatizer):
        self.vocab = Vocabulary()
        self.lemmatizer = lemmatizer
        self.label2index = {}

    def process(self, file_name, mode):
        """Parse the input txt file and return a processed format.

        A line is valid when splitting it on '|||' gives exactly two parts
        (topic, text); other lines are counted as failed and skipped.

        Args:
            file_name: the name of the input file that need to be parsed
            mode: "train" / "validation" / "test"
                  "train": parsing the training data, build vocabulary and
                           label mapping on the fly.
                  "validation": parsing the validation data.
                  "test": parsing the test data (no labels are collected).
        Return:
            (label_ls, tokens_id_ls)
            label_ls: A list of topic labels in index format
                      (empty in "test" mode).
                      [topic_idx_1, topic_idx_2, ...]
            tokens_id_ls: A list of tokens that has been converted to index
                          based on the vocabulary build on training set.
                          [[t11, t12, t13, ...], [t21, t22, ...], ...]
        Raises:
            ValueError: outside "test" mode, when a topic was not seen in
                the training data.
        """
        start_time = time()
        success_line_cnt = 0
        fail_line_cnt = 0
        label_ls = []
        tokens_id_ls = []

        # The context manager closes the file even if a line raises; UTF-8 is
        # explicit so tokens do not depend on the platform's default encoding.
        with open(file_name, 'r', encoding='utf-8') as f_read:
            for line in f_read:
                splited_line = line.split('|||')
                if len(splited_line) != 2:
                    fail_line_cnt += 1
                    continue
                success_line_cnt += 1

                topic = splited_line[0].lower().strip()
                text = splited_line[1].lower().strip()

                token_ls = self.tokenize(text)

                if mode == "train":
                    # update label-to-index mapping; the next id is derived
                    # from the mapping itself so that a second training pass
                    # cannot reuse ids that are already taken
                    if topic not in self.label2index:
                        self.label2index[topic] = len(self.label2index)

                    # update vocabulary
                    self.vocab.add_token_ls(token_ls)

                # append tokens and label in a index version
                tokens_id_ls.append(self.get_tokens_id(token_ls))
                if mode != "test":
                    label_ls.append(self.get_label_idx(topic))

        print("Data processing on {} data finished in {:.1f}s. "
              "[{} line sucessful, {} line failed]".format(
                mode, time() - start_time, success_line_cnt, fail_line_cnt))

        return label_ls, tokens_id_ls

    def get_label_idx(self, topic):
        """Return the index of ``topic``.

        The misspelled topic "media and darama" is mapped to the index of
        "media and drama".

        Raises:
            ValueError: if the topic is not part of the label mapping.
        """
        try:
            return self.label2index[topic]
        except KeyError:
            if topic == "media and darama":
                return self.label2index["media and drama"]
            raise ValueError("the topic {} is not defined "
                             "in the training set".format(topic))

    def get_tokens_id(self, token_ls):
        """Map tokens to vocabulary indices; unknown tokens map to '<UNK>'."""
        token2index = self.vocab.get_token2index()
        unk_id = token2index['<UNK>']
        return [token2index.get(token, unk_id) for token in token_ls]

    def tokenize(self, text):
        """Split ``text`` on whitespace and normalize every token."""
        return [self.tokenize_function(token) for token in text.split()]

    def tokenize_function(self, token):
        """Strip a token and, when a lemmatizer is set, lemmatize it.

        The lemmatizer is applied twice: once with its default part of
        speech and once with 'v'.
        """
        token = token.strip()
        if self.lemmatizer is not None:
            token = self.lemmatizer.lemmatize(token)
            token = self.lemmatizer.lemmatize(token, 'v')
            token = token.strip()
        return token

    def get_vocab(self):
        """Return the vocabulary built in "train" mode."""
        return self.vocab

    def get_label2index(self):
        """Return the topic -> index dictionary."""
        return self.label2index

    def get_label_ls(self):
        """Return the topic names ordered by their index."""
        # Explicit placeholders; the interpreter-dependent name `_` is not
        # defined outside an interactive session.
        label_ls = [None] * len(self.label2index)
        for label, index in self.label2index.items():
            label_ls[index] = label
        return label_ls


In [64]:
# Parse the three splits. The 'train' pass builds the vocabulary and the label
# mapping that the later passes reuse, so the training file is processed first.
data_processor = DataProcessor(WordNetLemmatizer())
train_label_ls, train_tokens_id_ls = data_processor.process(
    'topicclass/topicclass_train.txt', 'train')
valid_label_ls, valid_tokens_id_ls = data_processor.process(
    'topicclass/topicclass_valid.txt', 'validation')
# 'test' mode collects no labels, so test_label_ls comes back empty.
test_label_ls, test_tokens_id_ls = data_processor.process(
    'topicclass/topicclass_test.txt', 'test')
vocabulary = data_processor.get_vocab()

# Keep only the pretrained vectors whose token occurs in the vocabulary, then
# build the embedding matrix row-aligned with the vocabulary's token list
# (tokens without a pretrained vector get a random row).
word_embedding = PretrainedEmbedding(args.embedding_size)
word_embedding.load('glove.42B.300d.txt', vocabulary.get_token_ls())
weight_matrix = word_embedding.get_embedding_weight_matrix(vocabulary.get_token_ls())

# Summary; the vocabulary size includes the reserved '<UNK>' and '<PAD>' tokens.
print("Overall vocabulary size is {}".format(vocabulary.get_vocab_size()))
print("Overall topic number is {}".format(len(data_processor.get_label2index())))
print("{} of {} ({:.0f}%) tokens has pretrained embeddings".format(
    len(word_embedding.get_embedding_words()), vocabulary.get_vocab_size(),
    len(word_embedding.get_embedding_words()) / vocabulary.get_vocab_size() * 100
))

Data processing on train data finished in 49.7s. [253909 line sucessful, 0 line failed]
Data processing on validation data finished in 0.1s. [643 line sucessful, 0 line failed]
Data processing on test data finished in 0.1s. [697 line sucessful, 0 line failed]
Overall vocabulary size is 104817
Overall topic number is 16
85031 of 104817 (81%) tokens has pretrained embeddings


## Initialize PyTorch Dataset and Dataloaders

In [0]:
class TopicTextData(Dataset):
    """
    Dataset of token-id sequences with optional topic labels.

    Args:
        tokens_id_ls: One list of vocabulary indices per sample.
                        [[t11, t12, t13, ...], [t21, t22, ...], ...]
        label_ls: One topic index per sample, or None / an empty list when
                    no labels are available.
                    [topic_idx_1, topic_idx_2, ...]
    """
    def __init__(self, tokens_id_ls, label_ls):
        self.tokens_id_ls, self.label_ls = tokens_id_ls, label_ls

    def __len__(self):
        return len(self.tokens_id_ls)

    def __getitem__(self, index):
        """Return ``(LongTensor of token ids, label)`` for one sample.

        The label is the placeholder 0 when the dataset has no labels.
        """
        sample_tokens = torch.LongTensor(self.tokens_id_ls[index])
        has_labels = self.label_ls is not None and len(self.label_ls) != 0
        sample_label = self.label_ls[index] if has_labels else 0
        return sample_tokens, sample_label

def collate_with_padding(batch):
    """Collate ``(token_tensor, label)`` samples into one padded batch.

    Sequences are right-padded with index 1 to the longest sequence of the
    batch.

    Returns:
        (tokens, labels): a (B, max_sent_len) LongTensor of token ids and a
        (B,) LongTensor of labels.
    """
    sequences, labels = [], []
    for sample in batch:
        sequences.append(sample[0])
        labels.append(sample[1])

    # pad_sequence stacks time-major: (max_sent_len, B).
    time_major = pad_sequence(sequences, padding_value=1)
    batch_major = time_major.transpose(0, 1)  # (B, max_sent_len)

    return batch_major, torch.LongTensor(labels)


In [0]:
# DataLoader settings for training: shuffled batches, padded per batch by
# collate_with_padding.
train_loader_params = dict(shuffle=True, 
                           batch_size=args.batch_size,
                           num_workers=4,
                           pin_memory=True,
                           drop_last=False,
                           collate_fn=collate_with_padding)

# Same settings without shuffling; shared by the validation and test loaders.
valid_loader_params = dict(shuffle=False, 
                           batch_size=args.batch_size,
                           num_workers=4,
                           pin_memory=True,
                           drop_last=False,
                           collate_fn=collate_with_padding)

train_data = TopicTextData(train_tokens_id_ls, train_label_ls)
valid_data = TopicTextData(valid_tokens_id_ls, valid_label_ls)
# test_label_ls is empty ('test' mode collects no labels), so TopicTextData
# yields the placeholder label 0 for every test sample.
test_data = TopicTextData(test_tokens_id_ls, test_label_ls)
train_dataloader = DataLoader(train_data, **train_loader_params)
valid_dataloader = DataLoader(valid_data, **valid_loader_params)
test_dataloader = DataLoader(test_data, **valid_loader_params)


## Define Models

In [0]:
class CNNBlock(nn.Module):
    """Base CNN block for text classification.

    Applies 1-D convolution(s) over the time axis followed by max-pooling
    over time.

    Input:
        token_embeddings (Tensor): B * E * T embeddings of input sentence.
    Output:
        conv_features (Tensor): B * FILTER_NUM convolved features.
    """
    def __init__(self, embedding_size, filter_size, filter_num, layer_num):
        super(CNNBlock, self).__init__()
        self.conv_layer = self.construct_conv_layer(
            embedding_size, filter_size, filter_num, layer_num)

    def forward(self, token_embeddings):
        # (B, E, T) -> (B, K, T')
        feature_maps = self.conv_layer(token_embeddings)
        # Keep the strongest response of every filter: (B, K, T') -> (B, K)
        return feature_maps.max(dim=2)[0]

    def construct_conv_layer(self, embedding_size, filter_size, 
                             filter_num, layer_num):
        """Build the convolution stack.

        Returns a single ``nn.Conv1d`` when ``layer_num`` is below 2,
        otherwise an ``nn.Sequential`` of ``layer_num`` convolutions. The
        stacked convolutions are chained directly, with no activation between
        them.
        """
        # The first layer reads the embeddings; every further layer reads the
        # previous layer's filter_num channels.
        extra_layers = layer_num - 1 if layer_num >= 2 else 0
        input_channels = [embedding_size] + [filter_num] * extra_layers

        convolutions = [
            nn.Conv1d(in_channels=channels,
                      out_channels=filter_num,
                      kernel_size=filter_size)
            for channels in input_channels
        ]

        if len(convolutions) == 1:
            return convolutions[0]
        return nn.Sequential(*convolutions)


class DeepCNN(nn.Module):
    """CNN text classifier with parallel convolution branches of width 3, 4, 5.

    Token ids are embedded, passed through the three ``CNNBlock`` branches,
    and the concatenated max-pooled features are classified by a two-layer
    head with batch normalization and dropout.

    Args:
        args: hyper-parameter object providing ``embedding_size``,
            ``filter_num``, ``layer_num``, ``hidden_size``, ``dropout`` and
            ``n_class``.
        voc: vocabulary object providing ``get_vocab_size()``.
        embedding_weight_matrix: optional ``(vocab_size, embedding_size)``
            array of pretrained weights; when given it is loaded into the
            embedding layer, which is then frozen.
        pad_index: token id used to right-pad inputs that are shorter than
            the convolutions require. Defaults to 1, the index of '<PAD>'
            in ``Vocabulary`` and the padding value used by
            ``collate_with_padding``.

    Input:
        tokens_id (LongTensor): B * T token indices.
    Output:
        pred_result (Tensor): B * n_class unnormalized class scores.
    """
    def __init__(self, args, voc, embedding_weight_matrix=None, pad_index=1):
        super(DeepCNN, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=voc.get_vocab_size(),
                                      embedding_dim=args.embedding_size)
        if embedding_weight_matrix is not None:
            embedding_weight_matrix = torch.Tensor(embedding_weight_matrix)
            # load_state_dict also checks that the matrix has the shape of
            # the embedding layer.
            self.embedding.load_state_dict({'weight': embedding_weight_matrix})
            self.embedding.weight.requires_grad = False

        self.CNN_filter_3 = CNNBlock(args.embedding_size, 3, args.filter_num, 
                                     args.layer_num)
        self.CNN_filter_4 = CNNBlock(args.embedding_size, 4, args.filter_num, 
                                     args.layer_num)
        self.CNN_filter_5 = CNNBlock(args.embedding_size, 5, args.filter_num,
                                     args.layer_num)
        self.linear = nn.Sequential(
            nn.BatchNorm1d(args.filter_num * 3),
            nn.Dropout(args.dropout),
            nn.Linear(args.filter_num * 3, args.hidden_size),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(args.hidden_size),
            nn.Dropout(args.dropout),
            nn.Linear(args.hidden_size, args.n_class),
        )

        # Every unpadded convolution of width k shortens the sequence by
        # k - 1, so the widest branch (k = 5, stacked n_conv_layers times)
        # needs at least n_conv_layers * 4 + 1 time steps. CNNBlock builds a
        # single convolution whenever layer_num is below 2.
        n_conv_layers = args.layer_num if args.layer_num >= 2 else 1
        self.min_seq_len = n_conv_layers * (5 - 1) + 1
        self.pad_index = pad_index

    def forward(self, tokens_id):
        # Right-pad batches that are shorter than the widest convolution
        # needs; without this Conv1d raises on short inputs (e.g. a single
        # short sentence). Longer inputs are left untouched.
        seq_len = tokens_id.size(1)
        if seq_len < self.min_seq_len:
            padding = tokens_id.new_full(
                (tokens_id.size(0), self.min_seq_len - seq_len), self.pad_index)
            tokens_id = torch.cat([tokens_id, padding], dim=1)

        tokens_embedding = self.embedding(tokens_id).permute(0, 2, 1) # (B, E, T)
        conv_feats_filter_size_3 = self.CNN_filter_3(tokens_embedding) # (B, K)
        conv_feats_filter_size_4 = self.CNN_filter_4(tokens_embedding) # (B, K)
        conv_feats_filter_size_5 = self.CNN_filter_5(tokens_embedding) # (B, K)
        conv_feats_concat = torch.cat([conv_feats_filter_size_3,
                                       conv_feats_filter_size_4,
                                       conv_feats_filter_size_5], dim=1)
        pred_result = self.linear(conv_feats_concat)

        return pred_result
        

In [65]:
# Build the classifier with the pretrained embedding matrix and move it to
# the selected device.
model = DeepCNN(args, vocabulary, weight_matrix).to(device)
# Bare last expression: the notebook displays the module structure.
model

DeepCNN(
  (embedding): Embedding(104817, 300)
  (CNN_filter_3): CNNBlock(
    (conv_layer): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
  )
  (CNN_filter_4): CNNBlock(
    (conv_layer): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
  )
  (CNN_filter_5): CNNBlock(
    (conv_layer): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (linear): Sequential(
    (0): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): Linear(in_features=300, out_features=512, bias=True)
    (3): ReLU(inplace=True)
    (4): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=512, out_features=16, bias=True)
  )
)

## Optimizer and Loss

In [0]:
# Adam over the model's parameters with weight decay args.wd.
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
# Cross-entropy on the unnormalized class scores the model outputs.
criterion = nn.CrossEntropyLoss()

## Model Training and Evaluation

In [0]:
class Evaluator():
    """Computes classification metrics and formats them for logging."""

    def evaluate(self, y_true, y_pred_proba, metric_list):
        """Compute the requested metrics.

        Args:
            y_true: array of true class indices.
            y_pred_proba: array of per-class scores; the predicted class is
                the argmax over the last axis.
            metric_list: names of the metrics to compute; recognized names
                are 'accuracy', 'micro_f1', 'macro_f1' and
                'confusion_matrix'.

        Returns:
            dict with the keys 'accuracy', 'micro-f1', 'macro-f1' and
            'confusion_matrix' for the requested metrics. The confusion
            matrix is stored as a string.
        """
        y_pred = np.argmax(y_pred_proba, -1)
        output = {}

        if 'accuracy' in metric_list:
            output['accuracy'] = metrics.accuracy_score(y_true, y_pred)

        if 'micro_f1' in metric_list:
            output['micro-f1'] = metrics.f1_score(y_true, y_pred, 
                                                  average='micro')

        if 'macro_f1' in metric_list:
            output['macro-f1'] = metrics.f1_score(y_true, y_pred, 
                                                  average='macro')

        if 'confusion_matrix' in metric_list:
            output['confusion_matrix'] = str(metrics.confusion_matrix(
                    y_true, y_pred))
        return output

    def get_metric_str(self, metric_dict):
        """Format a metric dictionary as ``"name: value, name: value"``.

        Numeric values are printed with three decimals. Any other value
        (e.g. the string form of the confusion matrix) is inserted as is,
        because it cannot be rendered with a float format.
        """
        parts = []
        for metric, value in metric_dict.items():
            if isinstance(value, (int, float, np.integer, np.floating)):
                parts.append("{}: {:.3f}".format(metric, value))
            else:
                parts.append("{}: {}".format(metric, value))
        return ", ".join(parts)

class ModelRunner():
    """Runs training, validation and prediction for a classification model.

    Args:
        args: hyper-parameter object providing ``n_epochs``, ``metric_list``
            and ``iter_interval``.
        model: the ``nn.Module`` to train; it is moved to ``device``.
        optim: optimizer instance (the parameter name shadows the
            ``torch.optim`` alias inside ``__init__`` only).
        criterion: loss called as ``criterion(scores, labels)``.
        device: device on which batches and the model are placed.
        lr_scheduler: scheduler whose ``step(val_loss)`` is called after
            every epoch that produced a validation loss; may be ``None``.
    """
    def __init__(self, args, model, optim, criterion, device, lr_scheduler):
        self.n_epochs = args.n_epochs
        self.metric_list = args.metric_list
        self.iter_interval = args.iter_interval
        self.device = device
        self.model = model.to(device)
        self.optimizer = optim
        self.criterion = criterion
        self.evaluator = Evaluator()
        self.time = time()       # start of the current logging interval
        self.running_loss = 0    # summed training loss of the current epoch
        self.lr_scheduler = lr_scheduler

    def train(self, train_dataloader, valid_dataloader=None):
        """Train for ``n_epochs`` epochs, validating after every epoch.

        Without ``valid_dataloader`` no validation is run and the learning
        rate scheduler is not stepped, since it has no metric to monitor.
        """
        self.time = time()
        for epoch in range(self.n_epochs):
            self.train_epoch(epoch, train_dataloader)
            val_loss = self.print_valid_process(valid_dataloader)
            # print_valid_process returns None when there is no validation
            # data; stepping the scheduler with None would raise.
            if val_loss is not None and self.lr_scheduler is not None:
                self.lr_scheduler.step(val_loss)

    def print_valid_process(self, valid_dataloader):
        """Evaluate on the validation data and print loss and metrics.

        Returns:
            The validation loss summed over all samples (the printed value
            is the per-sample mean), or ``None`` when ``valid_dataloader``
            is ``None``.
        """
        if valid_dataloader is None:
            return None

        self.model.eval()
        all_true_labels = []
        all_pred_labels = []
        val_loss, n_samples = 0, 0

        with torch.no_grad():
            for feat, label in valid_dataloader:
                batch_size = len(label)
                feat, label = feat.to(self.device), label.to(self.device)
                pred_topic_proba = self.model(feat)  # (B, n_class)
                loss = self.criterion(pred_topic_proba, label)

                # The criterion returns a batch mean; weight it by the batch
                # size so batches of different size contribute correctly.
                val_loss += loss.item() * batch_size
                n_samples += batch_size
                all_true_labels.extend(label.cpu().tolist())
                all_pred_labels.extend(pred_topic_proba.cpu().tolist())

        valid_metrics = self.evaluator.evaluate(np.array(all_true_labels),
                                                np.array(all_pred_labels),
                                                self.metric_list)
        metric_str = self.evaluator.get_metric_str(valid_metrics)
        print("[Evaluation] loss: {:.3f} {}".format(
            val_loss / n_samples, metric_str
        ))

        return val_loss

    def predict(self, input_data):
        """Return the predicted class index of every row of ``input_data``.

        Args:
            input_data: batch tensor accepted by the model.
        Returns:
            numpy array of shape (B,) with the argmax class indices.
        """
        self.model.eval()
        input_data = input_data.to(self.device)

        with torch.no_grad():
            pred_topic_proba = self.model(input_data)
        return np.argmax(pred_topic_proba.cpu().numpy(), axis=1)

    def train_epoch(self, epoch, train_dataloader):
        """Run one optimization pass over ``train_dataloader``.

        Labels and scores of the epoch are accumulated so that the running
        metrics can be reported every ``iter_interval`` iterations.
        """
        self.model.train()
        num_iter_per_epoch = len(train_dataloader)
        all_labels, all_predictions = [], []
        self.running_loss = 0
        n_samples = 0

        for idx, (feat, label) in enumerate(train_dataloader):
            self.optimizer.zero_grad()

            batch_size = len(label)
            feat, label = feat.to(self.device), label.to(self.device)
            pred_topic_proba = self.model(feat)  # (B, n_class)

            loss = self.criterion(pred_topic_proba, label)
            loss.backward()
            self.optimizer.step()

            self.running_loss += loss.item() * batch_size
            n_samples += batch_size

            all_labels.extend(label.cpu().tolist())
            # detach(): only the values are needed for the metrics.
            all_predictions.extend(pred_topic_proba.detach().cpu().tolist())

            self.print_train_process(epoch, idx, num_iter_per_epoch, 
                                     n_samples, all_labels, all_predictions)

    def print_train_process(self, epoch, idx, num_iter_per_epoch, n_samples,
                            all_labels, all_predictions):
        """Print running loss and metrics every ``iter_interval`` iterations.

        The bracketed time is the time elapsed since the previous log line
        (or since ``train`` started).
        """
        if (idx + 1) % self.iter_interval == 0:
            train_metrics = self.evaluator.evaluate(np.array(all_labels),
                                                    np.array(all_predictions),
                                                    self.metric_list)
            metric_str = self.evaluator.get_metric_str(train_metrics)
            train_info = "Epoch: {}/{}, iter: {}/{}, loss: {:.3f}, "\
                         "{} [{:.1f}s]".format(epoch + 1, self.n_epochs, 
                                              idx + 1, num_iter_per_epoch, 
                                              self.running_loss / n_samples, 
                                              metric_str, time() - self.time)
            print(train_info)
            self.time = time()

    def get_prediction(self, dataloader, label_ls):
        """Predict a topic name for every sample of ``dataloader``.

        Args:
            dataloader: iterable of ``(features, label)`` batches; the
                labels are ignored.
            label_ls: topic names indexed by class id.
        Returns:
            list with one topic name per sample, in dataloader order.
        """
        self.model.eval()
        pred_id_ls = []

        with torch.no_grad():
            for feat, _ in dataloader:
                # Use the runner's own device rather than a notebook global.
                feat = feat.to(self.device)
                pred_topic_proba = self.model(feat)
                pred_id_ls.extend(
                    np.argmax(pred_topic_proba.cpu().numpy(), axis=1).tolist())

        return [label_ls[topic_id] for topic_id in pred_id_ls]


In [71]:
# Train for args.n_epochs epochs, validating after every epoch. The validation
# loss is passed to ReduceLROnPlateau, configured here to scale the learning
# rate by factor 0.2 in 'min' mode with patience 1.
model_runner = ModelRunner(args, model, optimizer, criterion, device,
                           ReduceLROnPlateau(optimizer, 
                                             mode='min', 
                                             patience=1,
                                             factor=0.2))
model_runner.train(train_dataloader, valid_dataloader)

Epoch: 1/10, iter: 300/992, loss: 0.613, accuracy: 0.799 [4.0s]
Epoch: 1/10, iter: 600/992, loss: 0.614, accuracy: 0.799 [3.6s]
Epoch: 1/10, iter: 900/992, loss: 0.617, accuracy: 0.798 [4.4s]
[Evaluation] loss: 0.563 accuracy: 0.829
Epoch: 2/10, iter: 300/992, loss: 0.600, accuracy: 0.803 [5.8s]
Epoch: 2/10, iter: 600/992, loss: 0.606, accuracy: 0.802 [3.6s]
Epoch: 2/10, iter: 900/992, loss: 0.605, accuracy: 0.802 [3.7s]
[Evaluation] loss: 0.561 accuracy: 0.827
Epoch: 3/10, iter: 300/992, loss: 0.585, accuracy: 0.806 [6.8s]
Epoch: 3/10, iter: 600/992, loss: 0.590, accuracy: 0.806 [3.6s]
Epoch: 3/10, iter: 900/992, loss: 0.594, accuracy: 0.804 [3.6s]
[Evaluation] loss: 0.569 accuracy: 0.827
Epoch: 4/10, iter: 300/992, loss: 0.573, accuracy: 0.811 [6.5s]
Epoch: 4/10, iter: 600/992, loss: 0.576, accuracy: 0.811 [3.5s]
Epoch: 4/10, iter: 900/992, loss: 0.583, accuracy: 0.809 [3.6s]
[Evaluation] loss: 0.568 accuracy: 0.824
Epoch: 5/10, iter: 300/992, loss: 0.557, accuracy: 0.816 [6.6s]
Epoc

## Write to file

In [0]:
# Predict a topic name for every validation and test sample, then write the
# predictions to the result files, one topic per line.
label_names = data_processor.get_label_ls()
valid_pred_topic_ls = model_runner.get_prediction(valid_dataloader, label_names)
test_pred_topic_ls = model_runner.get_prediction(test_dataloader, label_names)

for out_path, pred_topic_ls in (('dev_results.txt', valid_pred_topic_ls),
                                ('test_results.txt', test_pred_topic_ls)):
    with open(out_path, 'w') as f:
        f.writelines(topic + '\n' for topic in pred_topic_ls)
