In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, TensorDataset
import pickle

In [10]:
# Load the dictionaries
with open('models/bilstm_char/id2word.pkl', 'rb') as f:
    id2word = pickle.load(f)
with open('models/bilstm_char/id2tag.pkl', 'rb') as f:
    id2tag = pickle.load(f)
with open('models/bilstm_char/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
with open('models/bilstm_char/nertags.pkl', 'rb') as f:
    nertags = pickle.load(f)

In [7]:
## LSTM for CHARACTER level

class forLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, pretr_char_embed):
        super(forLSTM, self).__init__()
        self.charembed = nn.Embedding.from_pretrained(pretr_char_embed, freeze = False) #size of pretrained = (totalchars,embedding size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional = True, batch_first = True)

    def forward(self, xchar, xlength_char):
        #xchar is of shape(batchsize, seqlen_maxinbatch, maxwordlen-ie max char = 6)

        shape = xchar.shape
        xchar = xchar.view(-1, shape[2])
        xlength_char = xlength_char.view(-1)
        input = pack_padded_sequence(xchar, xlength_char.cpu(), batch_first=True, enforce_sorted=False)
        input, _ = pad_packed_sequence(input, batch_first=True)
        embed = self.charembed(input)
        _, (h,_) = self.lstm(embed) #h is of size (2, 128*maxno. of words in a sentence in the batch, 25)
        h = h.view(h.shape[1], 50)
        h = h.view(shape[0], shape[1], 50)
        return h

In [8]:
## BILSTM model

class BiLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, total_words, num_class, pretrained = False, pretrained_embed = None, char_embed_size = 0, pretr_char_embed = None):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.wordembed = nn.Embedding.from_pretrained(pretrained_embed, freeze = False)
        self.for_charembed = forLSTM(embedding_size = char_embed_size, hidden_size = 25, pretr_char_embed = pretr_char_embed)
        self.dropout = nn.Dropout(p = 0.5)
        self.bilstm = nn.LSTM(embedding_size + 50,hidden_size, bidirectional = True, batch_first = True)
        self.linear = nn.Linear(2*hidden_size, num_class) # 2 because forward and backward concatenate

    def forward(self, x, xchar, xlengths, xlength_char):
        x = pack_padded_sequence(x, xlengths.cpu(), batch_first=True, enforce_sorted=False)
        x, _ = pad_packed_sequence(x, batch_first=True)

        xlength_char = pack_padded_sequence(xlength_char, xlengths.cpu(), batch_first=True, enforce_sorted=False)
        xlength_char, _ = pad_packed_sequence(xlength_char, batch_first=True, padding_value = len("<pad>")) 
        # above this line padding value is taken as len of pad word becasue that is what we pad sentences 
        # with hance at a character level it should be the length

        xchar = pack_padded_sequence(xchar, xlengths.cpu(), batch_first=True, enforce_sorted=False)
        xchar, _ = pad_packed_sequence(xchar, batch_first=True)

        word_embedding = self.wordembed(x) # x is of size(batchsize, seq_len), wordembed is of size (batchsize, seq_len, embedding_size = 100)
        forwardchar= self.for_charembed(xchar, xlength_char) #forwardchar and backwardchar would be of size (batchsize, seqlen. embedding_size = 25each) 
        word_embedding = torch.cat((word_embedding, forwardchar), dim = 2)

        word_embedding = self.dropout(word_embedding) #dropout
        out, (h,c) = self.bilstm(word_embedding) #'out' has dimension(batchsize, seq_len, 2*hidden_size)

        out = self.linear(out) #now 'out' has dimension(batchsize, seq_len, num_class)
        out = out.view(-1, out.shape[2]) # shape (128*seqlen, 18)
        out = F.log_softmax(out, dim=1) # take the softmax across the dimension num_class, 'out' has dimension(batchsize, seq_len, num_class)
        return out

In [9]:
model = BiLSTM(embedding_size = 100, hidden_size = 100, total_words = len(vocab), num_class = 18, char_embed_size = len(char_vocab)) 
model.load_state_dict(torch.load('models/bilstm_char/trained_bilstm_model_state_dict.pth'))

model.eval()


AttributeError: 'NoneType' object has no attribute 'dim'

In [14]:
def out_predictions(model, loader, output_file):
    with open(output_file, 'w') as f:
        with torch.no_grad():
            for step, (X, Y, xlen) in enumerate(loader):
                Y = pack_padded_sequence(Y, xlen, batch_first=True, enforce_sorted=False)
                Y, _ = pad_packed_sequence(Y, batch_first=True)
                ypred = model(X.long().to(device), xlen.to(device))
                ypred = torch.argmax(ypred.to('cpu'), dim=1)
                ypred = ypred.view(Y.shape[0], -1)
                for i in range(len(ypred)):
                    for j in range(len(ypred[i])):
                        word = id2word[int(X[i, j])]
                        tag = id2tag[int(ypred[i, j])]
                        f.write(f"{word}\t{tag}\n")
                    f.write('\n')

# Assuming 'device' and 'id2word', 'id2tag' are defined elsewhere
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
def load_data(datapath):
    sentences = []
    tags = []
    with open(datapath) as f:
        lines = f.readlines()
        sentence = []
        tag = []
        for line in lines:
            line = line.strip()  # Remove leading/trailing whitespace
            if line:  # If the line is not empty
                word, tag_label = line.split('\t')
                if vocab is not None:
                    if word in vocab.keys():
                        sentence.append(vocab[word])
                    else:
                        sentence.append(vocab['<oov>'])
                if nertags is not None:
                    tag.append(nertags[tag_label])
            else:  # If the line is empty, indicating end of a sentence
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag)
                    sentence = []
                    tag = []

    # Padding the sentences at the end
    max_length = max(len(x) for x in sentences)
    x_lengths = [len(x) for x in sentences]
    X_test = []
    Y_test = []
    for sent, tag in zip(sentences, tags):
        length_to_append = max_length - len(sent)
        X_test.append(sent + [0] * length_to_append)  # Padding with zeros
        Y_test.append(tag + [0] * length_to_append)  # Padding with zeros

    X_test = torch.Tensor(X_test)
    Y_test = torch.Tensor(Y_test)
    x_lengths = torch.Tensor(x_lengths)

    return X_test, Y_test, x_lengths


In [16]:
testdatapath = 'data/altered_capitalization_swap_rate_0.1.txt'

# Test dataset preparation
Xtest, Ytest, x_testlengths = load_data(testdatapath)

testdataset = TensorDataset(Xtest, Ytest, x_testlengths)
loader_test = DataLoader(testdataset, batch_size=1, shuffle=False)

# Output predictions
out_predictions(model, loader_test, 'out/predictions_altered_capitalization_swap_rate_0.1.txt')