In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, TensorDataset
import pickle

In [10]:
# Restore the lookup tables that were saved next to the trained model.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


id2word = _read_pickle('models/id2word.pkl')
id2tag = _read_pickle('models/id2tag.pkl')
vocab = _read_pickle('models/vocab.pkl')
nertags = _read_pickle('models/nertags.pkl')

In [3]:
class BiLSTM(nn.Module):
    """Bidirectional-LSTM sequence tagger.

    Pipeline: word embedding -> dropout -> single-layer BiLSTM -> linear
    projection -> log-softmax over the tag classes.

    Args:
        embedding_size: dimensionality of the word embeddings.
        hidden_size: hidden units per LSTM direction (the linear layer
            therefore receives ``2 * hidden_size`` features).
        total_words: number of rows in the embedding table.
        num_class: number of output tag classes.
    """

    def __init__(self, embedding_size, hidden_size, total_words, num_class):
        super().__init__()
        self.hidden_size = hidden_size
        # Attribute names and creation order are part of the checkpoint
        # contract (they determine the state_dict keys), so keep them stable.
        self.wordembed = nn.Embedding(num_embeddings=total_words, embedding_dim=embedding_size)
        self.dropout = nn.Dropout(p=0.5)
        self.bilstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.linear = nn.Linear(in_features=2 * hidden_size, out_features=num_class)

    def forward(self, x, xlengths):
        """Return per-token log-probabilities, flattened over batch and time.

        Args:
            x: batch-first tensor of word ids, one padded sentence per row.
            xlengths: tensor holding the true length of each row of ``x``.

        Returns:
            Tensor with ``num_class`` columns and one row per (sentence,
            position) pair of the trimmed batch, holding log-softmax scores.
        """
        # Packing and immediately unpacking the ids trims the time axis to the
        # longest sentence in this batch (positions past a sentence's own
        # length come back as the default padding value, 0).
        packed_ids = pack_padded_sequence(
            x, xlengths.cpu(), batch_first=True, enforce_sorted=False
        )
        token_ids, _ = pad_packed_sequence(packed_ids, batch_first=True)

        embedded = self.dropout(self.wordembed(token_ids))
        hidden_states, _ = self.bilstm(embedded)
        logits = self.linear(hidden_states)

        # Collapse (batch, time, classes) into (batch * time, classes).
        flat_logits = logits.view(-1, logits.shape[2])
        return torch.nn.functional.log_softmax(flat_logits, dim=1)

# Rebuild the network with the same hyper-parameters as the saved checkpoint;
# any mismatch in these sizes makes load_state_dict fail with a shape error.
model = BiLSTM(embedding_size=100, hidden_size=100, total_words=len(vocab), num_class=18)

# Load the saved weights. map_location='cpu' lets a checkpoint that was saved
# from a GPU be restored on a machine without CUDA; the model itself is built
# on the CPU here, so the loaded values end up on the CPU either way.
model.load_state_dict(torch.load('models/trained_bilstm_model.pth', map_location='cpu'))

# Switch to evaluation mode (disables dropout). Kept as the last expression so
# the cell displays the architecture summary.
model.eval()


BiLSTM(
  (wordembed): Embedding(23626, 100)
  (dropout): Dropout(p=0.5, inplace=False)
  (bilstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=200, out_features=18, bias=True)
)

In [4]:
# Sanity check on the restored checkpoint: list every parameter with its shape.
state_dict = model.state_dict()
print('\n'.join(f"{name} {tensor.shape}" for name, tensor in state_dict.items()))

wordembed.weight torch.Size([23626, 100])
bilstm.weight_ih_l0 torch.Size([400, 100])
bilstm.weight_hh_l0 torch.Size([400, 100])
bilstm.bias_ih_l0 torch.Size([400])
bilstm.bias_hh_l0 torch.Size([400])
bilstm.weight_ih_l0_reverse torch.Size([400, 100])
bilstm.weight_hh_l0_reverse torch.Size([400, 100])
bilstm.bias_ih_l0_reverse torch.Size([400])
bilstm.bias_hh_l0_reverse torch.Size([400])
linear.weight torch.Size([18, 200])
linear.bias torch.Size([18])


In [14]:
def out_predictions(model, loader, output_file, word_lookup=None, tag_lookup=None):
    """Write the model's tag prediction for every token to `output_file`.

    The file contains one ``word<TAB>tag`` line per token and a blank line
    after each sentence.

    Args:
        model: tagger called as ``model(word_ids, lengths)``; it must return
            one row of class scores per (sentence, position) of the batch.
        loader: iterable yielding ``(X, Y, xlen)`` batches, where ``X`` holds
            padded word ids and ``xlen`` the true sentence lengths. ``Y`` is
            not used for prediction.
        output_file: path of the text file to (over)write.
        word_lookup: optional id -> word mapping; defaults to the notebook's
            global ``id2word``.
        tag_lookup: optional id -> tag mapping; defaults to the notebook's
            global ``id2tag``.
    """
    import os  # local import: only needed to create the output directory

    word_lookup = id2word if word_lookup is None else word_lookup
    tag_lookup = id2tag if tag_lookup is None else tag_lookup

    # Feed the inputs to whatever device the model's weights live on, so the
    # call cannot fail with a CPU/GPU mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Predict with dropout disabled, then restore the caller's train/eval mode.
    was_training = model.training
    model.eval()
    try:
        with open(output_file, 'w') as f, torch.no_grad():
            for X, _, xlen in loader:
                ypred = model(X.long().to(model_device), xlen.to(model_device))
                ypred = torch.argmax(ypred.to('cpu'), dim=1)
                # The model returns a flat (batch * time) axis; restore rows.
                ypred = ypred.view(X.shape[0], -1)
                for i in range(ypred.shape[0]):
                    # Stop at the sentence's true length so that padding
                    # positions are never written out as tokens.
                    n_tokens = min(int(xlen[i]), ypred.shape[1])
                    for j in range(n_tokens):
                        word = word_lookup[int(X[i, j])]
                        tag = tag_lookup[int(ypred[i, j])]
                        f.write(f"{word}\t{tag}\n")
                    f.write('\n')
    finally:
        model.train(was_training)

# Inference device: the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the model's weights on that device; a CPU model fed CUDA inputs raises
# a device-mismatch error.
model = model.to(device)

In [15]:
def load_data(datapath, word_to_id=None, tag_to_id=None):
    """Read a ``word<TAB>tag`` file and return padded id tensors.

    Each non-blank line holds one token and its tag separated by a tab; a
    blank line ends a sentence. A final sentence that is not followed by a
    blank line is kept as well.

    Args:
        datapath: path of the text file to read.
        word_to_id: optional word -> id mapping that contains an ``'<oov>'``
            entry for unknown words; defaults to the notebook's global
            ``vocab``.
        tag_to_id: optional tag -> id mapping; defaults to the notebook's
            global ``nertags``.

    Returns:
        ``(X_test, Y_test, x_lengths)``: word ids and tag ids, right-padded
        with 0 to the longest sentence, and the true sentence lengths. All
        three are created with ``torch.Tensor`` and therefore use the default
        floating dtype; cast with ``.long()`` before indexing an embedding.

    Raises:
        ValueError: if a non-blank line is not exactly ``word<TAB>tag`` or
            the file contains no sentences.
        KeyError: if a tag is missing from the tag mapping.
    """
    word_to_id = vocab if word_to_id is None else word_to_id
    tag_to_id = nertags if tag_to_id is None else tag_to_id

    sentences = []
    tags = []
    sentence = []
    tag = []
    with open(datapath) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()  # Remove leading/trailing whitespace
            if line:
                fields = line.split('\t')
                if len(fields) != 2:
                    raise ValueError(
                        f"{datapath}:{line_no}: expected 'word<TAB>tag', got {line!r}"
                    )
                word, tag_label = fields
                # Words outside the vocabulary share the '<oov>' id.
                if word in word_to_id:
                    sentence.append(word_to_id[word])
                else:
                    sentence.append(word_to_id['<oov>'])
                tag.append(tag_to_id[tag_label])
            elif sentence:  # Blank line: close the current sentence
                sentences.append(sentence)
                tags.append(tag)
                sentence = []
                tag = []

    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)
        tags.append(tag)

    if not sentences:
        raise ValueError(f"no sentences found in {datapath!r}")

    # Right-pad every sentence (and its tags) with 0 up to the longest one.
    x_lengths = [len(sent) for sent in sentences]
    max_length = max(x_lengths)
    X_test = [sent + [0] * (max_length - len(sent)) for sent in sentences]
    Y_test = [labels + [0] * (max_length - len(labels)) for labels in tags]

    X_test = torch.Tensor(X_test)
    Y_test = torch.Tensor(Y_test)
    x_lengths = torch.Tensor(x_lengths)

    return X_test, Y_test, x_lengths


In [16]:
# Evaluate the trained tagger on the capitalization-perturbed test file.
testdatapath = 'data/altered_capitalization_swap_rate_0.1.txt'
Xtest, Ytest, x_testlengths = load_data(testdatapath)

# One sentence per batch, in file order, so predictions line up with the input.
testdataset = TensorDataset(Xtest, Ytest, x_testlengths)
loader_test = DataLoader(dataset=testdataset, batch_size=1, shuffle=False)

# Write one "word<TAB>tag" line per token to the predictions file.
out_predictions(
    model=model,
    loader=loader_test,
    output_file='out/predictions_altered_capitalization_swap_rate_0.1.txt',
)