In [1]:
import copy
import math
from torchtext import data, datasets
import string
import nltk
from nltk.corpus import stopwords
import torch
from torch import nn
import numpy as np
import pandas as pd

# Preprocessing

In [2]:
def tokenizer(text):
    tokens = []
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if not word.isnumeric():
            tokens.append(word)
    return tokens


In [3]:
ID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=tokenizer, include_lengths=True, batch_first=True, fix_length=144, eos_token='eos')
LABEL = data.Field(sequential=False, use_vocab=False)

train, test_set = data.TabularDataset.splits(path='./', train='train.csv', test='test.csv',
                                        format='csv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
                                         skip_header=True, csv_reader_params={"delimiter":','})
TEXT.build_vocab(train, vectors="glove.840B.300d")

In [4]:
# torchtext will initialize <pad> <eos> and <unk> vectors to zeros
# use different random vectors to differentiate them
origin_vocab_vec = TEXT.vocab
for i in range(3):
    origin_vocab_vec.vectors[i] = torch.rand(300)

In [22]:
import csv
def get_doc(file_path, stoi, keep_sents=20, sent_length=40):
    with open(file_path, "r") as f:
        tr = csv.reader(f, delimiter=',')
        docs = []
        sents_len = []
        docs_label = []
        docs_len = [] 
        ids = []
        for i, row in enumerate(tr):
            if i == 0: continue
            doc = []
            sent_len = []
            docs_label.append(int(row[2]))
            sents = row[1].split('.')
            for j, s in enumerate(sents):
                if j >= keep_sents: break
                s = s.split()
                if len(s) == 0:
                    continue
                sent = []
                for k, w in enumerate(s):
                    if k >= sent_length: break
                    sent.append(stoi.get(w.lower(), stoi['<pad>']))
                sent_len.append(len(sent))
                while len(sent) < sent_length: sent.append(stoi['<pad>'])
                doc.append(sent)
            docs_len.append(len(doc))
            while len(doc) < keep_sents:
                doc.append([stoi['<pad>'] for _ in range(sent_length)])
                sent_len.append(1)
            docs.append(doc)
            sents_len.append(sent_len)
            ids.append(int(row[0]))
    res = [docs, docs_label, sents_len, docs_len, ids]
    for i in range(len(res)):
        res[i] = torch.cuda.LongTensor(res[i])
    return tuple(res)
train, train_y, train_s_len, train_d_len, train_id = get_doc("train.csv", TEXT.vocab.stoi, 30, 30)
test, test_y, test_s_len, test_d_len, test_id = get_doc("test.csv", TEXT.vocab.stoi, 30, 30)


# Helper Functions

In [8]:
class torchIter:
    def __init__(self, batchsize, doc, doc_labels, doc_len, sent_len, ids):
        self.samples = doc.shape[0]
        self.batchsize = batchsize
        self.ctimes = (self.samples + batchsize - 1) // batchsize
        self.params = (doc, doc_labels, doc_len, sent_len, ids)
        
    def __iter__(self):
        self.ctr = 0
        return self
    
    def __next__(self):
        if self.ctr >= self.ctimes:
            raise StopIteration
        st = self.batchsize * self.ctr
        ed = st + self.batchsize
        self.ctr += 1
        return tuple(p[st:ed] for p in self.params)

In [9]:
def getSortedOrder(lens):
    sortedLen, fwdOrder = torch.sort(
        lens.contiguous().view(-1), dim=0, descending=True)
    _, bwdOrder = torch.sort(fwdOrder)
    sortedLen = sortedLen.cpu().numpy().tolist()
    return sortedLen, fwdOrder, bwdOrder

def dynamicRNN(rnnModel,
               seqInput,
               seqLens):
    '''
    Inputs:
        rnnModel     : Any torch.nn RNN model
        seqInput     : (batchSize, maxSequenceLength, embedSize)
                        Input sequence tensor (padded) for RNN model
        seqLens      : batchSize length torch.LongTensor or numpy array
        initialState : Initial (hidden, cell) states of RNN

    Output:
        A single tensor of shape (batchSize, rnnHiddenSize) corresponding
        to the outputs of the RNN model at the last time step of each input
        sequence. If returnStates is True, also return a tuple of hidden
        and cell states at every layer of size (num_layers, batchSize,
        rnnHiddenSize)
    '''
    sortedLen, fwdOrder, bwdOrder = getSortedOrder(seqLens)
    sortedSeqInput = seqInput.index_select(dim=0, index=fwdOrder)
    packedSeqInput = nn.utils.rnn.pack_padded_sequence(
        sortedSeqInput, lengths=sortedLen, batch_first=True)

    output, _ = rnnModel(packedSeqInput)
    output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)

    rnn_output = output.index_select(dim=0, index=bwdOrder)
    return rnn_output

# Model

In [10]:
class Model(nn.Module):
    def __init__(self, vocab, dropout, hidden_size, layer1, device=torch.device('cpu')):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding_size = 300
        self.embedding = nn.Embedding.from_pretrained(vocab.vectors, freeze=True)
        self.rnn = nn.GRU(input_size=self.embedding_size, hidden_size=self.hidden_size, batch_first=True, bidirectional=True)
        self.drnn = nn.GRU(input_size=self.hidden_size * 2, hidden_size=self.hidden_size, batch_first=True, bidirectional=True)
        self.h_u = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.h_u2 = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.u_w = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(1, 1, self.hidden_size)))
        self.u_d = nn.Parameter(nn.init.xavier_normal_(torch.Tensor(1, 1, self.hidden_size)))
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(-1)
        self.m_hid = nn.Linear(self.hidden_size * 2, layer1)
        self.score = nn.Linear(layer1, 5)
        self.dropout = nn.Dropout(p=dropout)
    def forward(self, data, doc_len, sent_len):
        bs, dc, st = data.shape
        data = self.embedding(data)
        data = self.dropout(data)
        data = data.view(bs*dc, st, -1)
        data_len = sent_len.view(bs*dc)
        output = dynamicRNN(self.rnn, data, data_len)
        u = self.tanh(self.h_u(output))
        alpha = self.softmax(torch.sum(self.u_w * u, -1))
        s = torch.sum(alpha.unsqueeze(-1) * output, 1)
        s = s.view(bs, dc, -1)
        output = dynamicRNN(self.drnn, s, doc_len)
        u2 = self.tanh(self.h_u2(output))
        alpha2 = self.softmax(torch.sum(self.u_d * u2, -1))
        d = torch.sum(alpha2.unsqueeze(-1) * output, 1)
        attn_hid = self.relu(self.m_hid(d))
        attn_hid = self.dropout(attn_hid)
        score = self.score(attn_hid)
        return score

# Hyperparameters setting and initialization

In [29]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCHSIZE = 256
torch.cuda.empty_cache()
vocab = copy.deepcopy(TEXT.vocab)
m = nn.DataParallel(Model(vocab, 0.1, 100, 100, device=DEVICE).to(DEVICE))
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(m.parameters(), 1e-3, weight_decay=1e-4)
train_iter = torchIter(BATCHSIZE, train, train_y, train_d_len, train_s_len, train_id)
test_iter = torchIter(BATCHSIZE, test, test_y, test_d_len, test_s_len, test_id)

# 5-fold validation

In [30]:
k_fold = 5
fold_len = math.ceil(math.ceil(len(train) / BATCHSIZE) / k_fold)
print(fold_len)
epochs = 15
avg_acc = [[] for _ in range(epochs)]
start_dict = copy.deepcopy(m.state_dict())
nn.utils.clip_grad_norm_(m.parameters(), 10)
for k in range(k_fold):
    print("newk")
    m.load_state_dict(start_dict)
    for e in range(epochs):
        avg = []
        val_set = []
        val_loss = []
        train_acc = []
        m.train()
        for i, batch in enumerate(train_iter):
            if i >= k * fold_len and i < (k + 1) * fold_len:
                continue
            y = batch[1] - 1 # for torch
            score = m(batch[0], batch[2], batch[3])
            pred = torch.argmax(score, -1)
            train_acc.append((torch.sum(pred == y).float() / y.shape[0]).data)
            l = loss(score, y)
            avg.append(l.data)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        m.eval()
        for i, batch in enumerate(train_iter):
            if not (i >= k * fold_len and i < (k + 1) * fold_len):
                continue
            y = batch[1]
            score = nn.functional.softmax(m(batch[0], batch[2], batch[3]))
            pred = torch.argmax(score, -1) + 1
            l = torch.sum(pred == y).float() / y.shape[0]
            val_loss.append(l.data)
            avg_acc[e].append(l.data)
        print('train loss:', np.mean(avg), 'train acc:', np.mean(train_acc), 'val acc:', np.mean(val_loss))
print('avg val acc:', np.mean(avg_acc, 1))

13
newk




train loss: 1.3718334 train acc: 0.43429688 val acc: 0.3921274
train loss: 1.1178372 train acc: 0.4935156 val acc: 0.53155047
train loss: 0.91026473 train acc: 0.58382815 val acc: 0.60847354
train loss: 0.83833504 train acc: 0.6230469 val acc: 0.6207933
train loss: 0.8054542 train acc: 0.64085937 val acc: 0.6340144
train loss: 0.77645576 train acc: 0.65335935 val acc: 0.6451322
train loss: 0.7526119 train acc: 0.6671875 val acc: 0.65084136
train loss: 0.73245454 train acc: 0.67296875 val acc: 0.6550481
train loss: 0.7156479 train acc: 0.68460935 val acc: 0.6583534
train loss: 0.7016935 train acc: 0.6926563 val acc: 0.66376203
train loss: 0.6865207 train acc: 0.69742185 val acc: 0.65865386
train loss: 0.669266 train acc: 0.7084375 val acc: 0.6547476
train loss: 0.64975494 train acc: 0.7153906 val acc: 0.6577524
train loss: 0.62654 train acc: 0.7272656 val acc: 0.6547476
train loss: 0.61174333 train acc: 0.7335156 val acc: 0.64873797
newk
train loss: 1.4760774 train acc: 0.40476564 val a

# Based on validation result, choose the best epoch, retrain the model on training set



In [31]:
m.load_state_dict(start_dict)
m.train()
for e in range(12):
    for batch in train_iter:
        y = batch[1] - 1 # for torch
        score = m(batch[0], batch[2], batch[3])
        l = loss(score, y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    print(l.data)



tensor(1.2997, device='cuda:0')
tensor(1.0676, device='cuda:0')
tensor(0.8935, device='cuda:0')
tensor(0.8238, device='cuda:0')
tensor(0.7700, device='cuda:0')
tensor(0.7152, device='cuda:0')
tensor(0.6988, device='cuda:0')
tensor(0.7019, device='cuda:0')
tensor(0.6860, device='cuda:0')
tensor(0.6393, device='cuda:0')
tensor(0.6233, device='cuda:0')
tensor(0.5847, device='cuda:0')


# Generate the result on testing set

In [32]:
m.eval()
id = []
y = []
for batch in test_iter:
    score = m(batch[0], batch[2], batch[3])
    c = nn.functional.softmax(score)
    pred = torch.argmax(c, -1) + 1
    id.append(batch[4])
    y.append(pred)
id = torch.cat(id, 0)
y = torch.cat(y, 0)
sub_df = pd.DataFrame()
sub_df["id"] = id.cpu().numpy()
sub_df["pred"] = y.cpu().numpy()
sub_df.to_csv("submission.csv", index=False)

  
