In [12]:
import xml.etree.ElementTree as ET
import re
from collections import defaultdict
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
trainFile = 'train.xml'
testFile = 'test.xml'

In [13]:
class Dataset(TensorDataset):
    def __init__(self, p=None, h=None, l=None):
        if p is None and h is None and l is None :
            return
        self.p = torch.from_numpy(np.array(p))
        self.h = torch.from_numpy(np.array(h))
        labels = np.array([[1 if i == 1 else 0, 1 if i ==2 else 0] for i in l])
        self.labels = torch.from_numpy(labels).type(torch.FloatTensor)
    
    def to(self, device):
        newDataset = Dataset()
        newDataset.p = self.p.to(device)
        newDataset.h = self.h.to(device)
        newDataset.labels = self.labels.to(device)
        return newDataset
    
    def __len__(self):
        return len(self.p)
    
    def __getitem__(self, item):
        return [self.p[item], self.h[item]], self.labels[item]

In [14]:
class RNN(nn.Module):
    def __init__(self, vocab_length, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_length, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout = nn.Dropout(0.8)
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
    def forward(self, data):
        n = len(data[0])
        et = self.embedding(data[0])
        ht = self.embedding(data[1])
        rt, _ = self.lstm(et.view(n,1,-1))
        rh, _ = self.lstm(ht.view(n,1,-1))
        rth = torch.cat((rt.view(n,-1),rh.view(n,-1)), dim=0)
        fc = self.fc(torch.sum(self.dropout(rth),dim=0))
        return F.softmax(fc, dim=0)
    

In [15]:
class ModelTrainer():
    def __init__(self, model, optimizer, criterion, device):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.criterion = criterion.to(device)
        
    def train(self, trainData, testData, trainSample, testSample, N_EPOCHS=10):
        trainData = trainData.to(device)
        testData = testData.to(device)
        N_EPOCHS = N_EPOCHS
        best_valid_loss = float('inf')
        for epoch in range(N_EPOCHS):
            t = time.time()
            train_loss, train_acc = self.trainModel(trainData, trainSample)
            valid_loss, valid_acc = self.evaluateModel(testData, testSample)
            epoch_secs = time.time() - t            
            print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_secs:.2f}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        return model

    def trainModel(self, data, sampler):
        
        epoch_loss = 0
        epoch_acc = 0
        
        self.model.train()
        
        for i in sampler:
            optimizer.zero_grad()
            batch_data = data[i][0]
            batch_label = data[i][1]
            predictions = self.model(batch_data)
            loss = self.criterion(predictions, batch_label)
            acc = self.accuracy(predictions, batch_label)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        return epoch_loss / len(sampler), epoch_acc / len(sampler)
    
    def evaluateModel(self, data, sampler):
        epoch_loss = 0
        epoch_acc = 0
        
        self.model.eval()
        
        with torch.no_grad():
            for i in sampler:
                batch_data = data[i][0]
                batch_label = data[i][1]
                predictions = self.model(batch_data)
                loss = self.criterion(predictions, batch_label)
                acc = self.accuracy(batch_label, predictions)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
            
        return epoch_loss / len(sampler), epoch_acc / len(sampler)
    
    def accuracy(self, y_true, y_pred):
        y_pred = y_pred.data.argmax()
        y_true = y_true.data.argmax()
        return (y_true == y_pred).float()

In [16]:
class XMLToTensor:
    def __init__(self, files, maxLength=None):
        data = np.array([self.parseXML(file) for file in files])
        self.p = np.array([p for f in data for p in f[0]])
        self.h = np.array([p for f in data for p in f[1]])
        self.l = np.array([p for f in data for p in f[2]])
        self.wd = self.createWordToIntegerDict([word for words in self.p for word in words] + 
                                               [word for words in self.h for word in words])
        self.ld = self.createWordToIntegerDict([label for label in self.l])
        if maxLength is None:
            self.maxLength = max([len(s) for s in self.p + self.h])
        else:
            self.maxLength = maxLength
        
    def getTensor(self):
        p,h,l = self.prepareDataset(self.p,self.h,self.l)
        return Dataset(p,h,l)
    
    def splitTensor(self, trainSplit, testSplit):
        if trainSplit + testSplit != 1:
            print('Invalid split')
            return None
        n = len(self.p)
        p,h,l = self.prepareDataset(self.p,self.h,self.l)
        splitI = int(trainSplit * n)
        return Dataset(p[0:splitI],h[0:splitI],l[0:splitI]), Dataset(p[splitI:n], h[splitI:n], l[splitI:n])
        
    def getWordDictionary(self):
        return self.wd
    
    def getLabelDictionary(self):
        return self.ld
    
    def parseXML(self, file):
        tree = ET.parse(file)
        root = tree.getroot()
        pairs = root.findall('./pair')
        p = np.array([re.sub("[^\w]", " ", pair.find('./t').text.lower()).split() for pair in pairs])
        h = np.array([re.sub("[^\w]", " ", pair.find('./h').text.lower()).split() for pair in pairs])
        l = np.array([pair.get('value') for pair in pairs])
        return p,h,l
    
    def createWordToIntegerDict(self,words):
        d = defaultdict(int)
        i = 0
        for word in words:
            if d[word] == 0:
                i += 1
                d[word] = i
        return d
    
    def transformWordsToIntegers(self,w, d):
        return [[d[word] for word in words] for words in w]
        
    def transformListsToUniformLength(self,w, padding=0):
        L = [[padding for _ in range(self.maxLength)] for _ in w]
        for i,l in enumerate(w):
            end = len(l) if len(l) <= self.maxLength else self.maxLength
            L[i][0:end] = l[0:end]
        return L
        
    def prepareDataset(self, p, h, l):
        p = self.transformListsToUniformLength(self.transformWordsToIntegers(p, self.wd))
        h = self.transformListsToUniformLength(self.transformWordsToIntegers(h, self.wd))
        l = [self.ld[l] for l in l]
        return p,h,l
    

In [17]:
data = XMLToTensor([trainFile, testFile], maxLength=30)
train, test = data.splitTensor(0.8,0.2)
wordDict = data.getWordDictionary()

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(len(wordDict) + 1, 30, 30, 2).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
model = model.to(device)
criterion = criterion.to(device)
train = train.to(device)
test = test.to(device)

trainSample, testSample = RandomSampler(train), SequentialSampler(test)

In [19]:
trainer = ModelTrainer(model, optimizer, criterion, device)
trainer.train(train, test, trainSample, testSample, N_EPOCHS=10)

Epoch: 01 | Epoch Time: 4.62s
	Train Loss: 1.311 | Train Acc: 49.31%
	 Val. Loss: 0.817 |  Val. Acc: 45.26%
Epoch: 02 | Epoch Time: 4.57s
	Train Loss: 1.142 | Train Acc: 48.76%
	 Val. Loss: 0.822 |  Val. Acc: 50.00%
Epoch: 03 | Epoch Time: 4.50s
	Train Loss: 1.062 | Train Acc: 48.12%
	 Val. Loss: 0.834 |  Val. Acc: 51.82%
Epoch: 04 | Epoch Time: 4.53s
	Train Loss: 0.958 | Train Acc: 50.96%
	 Val. Loss: 0.735 |  Val. Acc: 48.18%
Epoch: 05 | Epoch Time: 4.53s
	Train Loss: 0.915 | Train Acc: 51.88%
	 Val. Loss: 0.728 |  Val. Acc: 51.09%
Epoch: 06 | Epoch Time: 4.50s
	Train Loss: 0.832 | Train Acc: 52.88%
	 Val. Loss: 0.730 |  Val. Acc: 50.73%
Epoch: 07 | Epoch Time: 4.56s
	Train Loss: 0.797 | Train Acc: 53.34%
	 Val. Loss: 0.733 |  Val. Acc: 50.00%
Epoch: 08 | Epoch Time: 4.55s
	Train Loss: 0.786 | Train Acc: 54.16%
	 Val. Loss: 0.712 |  Val. Acc: 53.65%
Epoch: 09 | Epoch Time: 4.53s
	Train Loss: 0.749 | Train Acc: 55.26%
	 Val. Loss: 0.706 |  Val. Acc: 55.11%
Epoch: 10 | Epoch Time: 4.50

RNN(
  (embedding): Embedding(7559, 30)
  (lstm): LSTM(30, 30, bidirectional=True)
  (dropout): Dropout(p=0.8, inplace=False)
  (fc): Linear(in_features=60, out_features=2, bias=True)
)

In [20]:
torch.randn(10,10).squeeze((1,1))

TypeError: squeeze() received an invalid combination of arguments - got (tuple), but expected one of:
 * ()
      didn't match because some of the arguments have invalid types: (!tuple!)
 * (name dim)
      didn't match because some of the arguments have invalid types: (!tuple!)
 * (int dim)
      didn't match because some of the arguments have invalid types: (!tuple!)
