In [1]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
trainFile = 'train.xml'
testFile = 'test.xml'

In [2]:
class Dataset(TensorDataset):
    def __init__(self, p=None, h=None, l=None):
        if p is None and h is None and l is None :
            return
        self.p = torch.from_numpy(np.array(p))
        self.h = torch.from_numpy(np.array(h))
        labels = [[1 if i == 1 else 0, 1 if i ==2 else 0] for i in l]
        self.labels = torch.from_numpy(np.array(labels)).type(torch.FloatTensor)
    
    def to(self, device):
        newDataset = Dataset()
        newDataset.p = self.p.to(device)
        newDataset.h = self.h.to(device)
        newDataset.labels = self.labels.to(device)
        return newDataset
    
    def __len__(self):
        return len(self.p)
    
    def __getitem__(self, item):
        return [self.p[item], self.h[item]], self.labels[item]
    
class RNN(nn.Module):
    def __init__(self, vocab_length, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_length, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
    def forward(self, data):
        n = len(data[0])
        et = self.embedding(data[0])
        #print(et.shape)
        ht = self.embedding(data[1])
        rt, _ = self.lstm(et.view(n,1,-1))
        rh, _ = self.lstm(ht.view(n,1,-1))
        rth = torch.cat((rt.view(n,-1),rh.view(n,-1)), dim=0)
        fc = self.fc(torch.sum(rth,dim=0))
        return F.softmax(fc, dim=0)
    
def train(model, optimizer, criterion, data, sampler):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for i in sampler:
        optimizer.zero_grad()
        batch_data = data[i][0]
        batch_label = data[i][1]
        predictions = model(batch_data)
        loss = criterion(predictions, batch_label)
        acc = equal(batch_label, predictions)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(sampler), epoch_acc / len(sampler)

def evaluate(model, criterion, data, sampler):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for i in sampler:
            batch_data = data[i][0]
            batch_label = data[i][1]
            predictions = model(batch_data)
            loss = criterion(predictions, batch_label)
            acc = equal(batch_label, predictions)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(sampler), epoch_acc / len(sampler)

def equal(y_true, y_pred):
    y_pred = y_pred.data.argmax()
    y_true = y_true.data.argmax()
    return (y_true == y_pred).float()
# Parses XML file for entailment-corpus.
def parseXML(trainFile):
    tree = ET.parse(trainFile)
    root = tree.getroot()
    pairs = root.findall('./pair')
    p = [re.sub("[^\w]", " ", pair.find('./t').text.lower()).split() for pair in pairs]
    h = [re.sub("[^\w]", " ", pair.find('./h').text.lower()).split() for pair in pairs]
    l = [pair.get('value') for pair in pairs] 
    return p,h,l

def createWordToIntegerDict(words):
    d = defaultdict(int)
    i = 0
    for word in words:
        if d[word] == 0:
            i += 1
            d[word] = i
    return d

def transformWordsToIntegers(w, d):
    return [[d[word] for word in words] for words in w]
    
def transformListsToUniformLength(w, maxLength, padding=0):
    L = [[padding for _ in range(maxLength)] for _ in w]
    for i,list in enumerate(w):
        end = len(list) if len(list) <= maxLength else maxLength
        L[i][0:end] = list[0:end]
    return L
    
def prepareDataset(p, h, l, wd, ld, maxLength=0):
    if maxLength == 0:
        maxLength = max([len(s) for s in p + h])
    print(maxLength)
    p = transformListsToUniformLength(transformWordsToIntegers(p, wd),maxLength=maxLength)
    h = transformListsToUniformLength(transformWordsToIntegers(h, wd),maxLength=maxLength)
    l = [ld[l] for l in l]
    return Dataset(p,h,l), maxLength
    

In [3]:
pTrain, hTrain, lTrain = parseXML(trainFile)
pTest, hTest, lTest = parseXML(testFile)
wd = createWordToIntegerDict([word for words in pTrain for word in words] + 
                             [word for words in hTrain for word in words])
ld = createWordToIntegerDict([label for label in lTrain])
trainData, maxLength = prepareDataset(pTrain, hTrain, lTrain,wd,ld,maxLength=0)
testData, _ = prepareDataset(pTest, hTest, lTest,wd,ld,maxLength=maxLength)

62
62


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(len(wd), 10, 100, 2)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)
model = model.to(device)
criterion = criterion.to(device)
trainData = trainData.to(device)
testData = testData.to(device)

trainSample, testSample = (RandomSampler(trainData), SequentialSampler(testData))

In [None]:
N_EPOCHS = 200

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    t = time.time()
    train_loss, train_acc = train(model, optimizer, criterion, trainData, trainSample)
    valid_loss, valid_acc = evaluate(model, criterion, testData, testSample)
    epoch_secs = time.time() - t
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 4.078298568725586s
	Train Loss: 0.756 | Train Acc: 49.03%
	 Val. Loss: 0.752 |  Val. Acc: 50.00%
Epoch: 02 | Epoch Time: 4.088341474533081s
	Train Loss: 0.769 | Train Acc: 46.21%
	 Val. Loss: 0.753 |  Val. Acc: 50.00%
Epoch: 03 | Epoch Time: 4.0809080600738525s
	Train Loss: 0.763 | Train Acc: 47.27%
	 Val. Loss: 0.752 |  Val. Acc: 50.00%
Epoch: 04 | Epoch Time: 4.073481559753418s
	Train Loss: 0.756 | Train Acc: 49.03%
	 Val. Loss: 0.752 |  Val. Acc: 50.00%
Epoch: 05 | Epoch Time: 4.078616380691528s
	Train Loss: 0.751 | Train Acc: 50.09%
	 Val. Loss: 0.751 |  Val. Acc: 50.00%
Epoch: 06 | Epoch Time: 4.1003196239471436s
	Train Loss: 0.751 | Train Acc: 50.44%
	 Val. Loss: 0.751 |  Val. Acc: 50.00%
Epoch: 07 | Epoch Time: 4.0715556144714355s
	Train Loss: 0.755 | Train Acc: 49.21%
	 Val. Loss: 0.751 |  Val. Acc: 50.00%
Epoch: 08 | Epoch Time: 4.086809396743774s
	Train Loss: 0.766 | Train Acc: 46.56%
	 Val. Loss: 0.753 |  Val. Acc: 50.00%
Epoch: 09 | Epoch Time: 4.083

In [None]:
m = nn.Conv2d((135,2), stride=(2,1), padding=(4,2), dilation=(3,1))
input = torch.randn(135,2)

output = m(input)
output
