In [22]:
from lime.lime_text import TextLIME

In [2]:
import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')

C:\Users\akgmn\Downloads\02460-Advanced-Machine-Learning-master\.data\train.csv: 29.5MB [00:01, 28.8MB/s]                            


In [3]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=1)


text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x) - 1

In [4]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [5]:
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [6]:
train_iter = AG_NEWS(split='train')
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [7]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [8]:
from torch.utils.data.dataset import random_split
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = AG_NEWS()
train_dataset = list(train_iter)
test_dataset = list(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

C:\Users\akgmn\Downloads\02460-Advanced-Machine-Learning-master\.data\test.csv: 1.86MB [00:00, 12.8MB/s]                  


| epoch   1 |   500/ 1782 batches | accuracy    0.673
| epoch   1 |  1000/ 1782 batches | accuracy    0.853
| epoch   1 |  1500/ 1782 batches | accuracy    0.878
-----------------------------------------------------------
| end of epoch   1 | time: 51.73s | valid accuracy    0.890 
-----------------------------------------------------------
| epoch   2 |   500/ 1782 batches | accuracy    0.898
| epoch   2 |  1000/ 1782 batches | accuracy    0.900
| epoch   2 |  1500/ 1782 batches | accuracy    0.902
-----------------------------------------------------------
| end of epoch   2 | time: 43.74s | valid accuracy    0.898 
-----------------------------------------------------------
| epoch   3 |   500/ 1782 batches | accuracy    0.915
| epoch   3 |  1000/ 1782 batches | accuracy    0.914
| epoch   3 |  1500/ 1782 batches | accuracy    0.913
-----------------------------------------------------------
| end of epoch   3 | time: 44.87s | valid accuracy    0.902 
-------------------------------

In [13]:
import numpy as np

In [102]:
def get_sample_predictions(indexed_string):
    ag_news_label = {1: "World",
                       2: "Sports",
                       3: "Business",
                       4: "Sci/Tec"}
    prediction_probs = []
    pred = []
    with torch.no_grad():
        text = torch.tensor(indexed_string)
        output = model(text, torch.tensor([0]))
    # The output has unnormalized scores. To get probabilities, run a softmax on it.
    prediction_probs.append(torch.nn.functional.softmax(output[0], dim=0))
    res = output.argmax(1).item()
    pred.append(res + 1)
    prediction_probs = np.array([i.numpy() for i in prediction_probs])
    return prediction_probs[0], pred

In [95]:
get_sample_predictions("hi you")

(array([2.8314906e-10, 2.6212803e-01, 6.4288867e-03, 7.3144311e-01],
       dtype=float32),
 [4])

In [89]:
from lime.indexers import StringTokenizer

In [108]:
ex_text_str2 = """
    GameStop shares plunged Wednesday after the long-struggling Grapevine, 
    Texas-based gameseller reported earnings that failed to impress analysts, 
    but the firm's new ecommerce push prompted one expert to release Wall Street's most bullish price target,
    marking the first analyst to say that a turnaround could actually justify GameStop's meteoric valuation.
""".strip().replace('\n', '')

In [129]:
ex_text_str2.strip().replace('  ', '').replace(',', '').replace('.','').lower()

"gamestop shares plunged wednesday after the long-struggling grapevine texas-based gameseller reported earnings that failed to impress analysts but the firm's new ecommerce push prompted one expert to release wall street's most bullish price targetmarking the first analyst to say that a turnaround could actually justify gamestop's meteoric valuation"

In [91]:
ex_text_str = """
    GameStop shares plunged Wednesday after the long-struggling Grapevine.
""".strip().replace('\n', '')

In [107]:
text_pipeline(ex_text_str)

[18895, 188, 4349, 57, 35, 3, 0, 71507, 2]

In [134]:
del TextLIME
from lime.lime_text import TextLIME

In [135]:
indexer = StringTokenizer(text_pipeline)
exp = TextLIME(indexer=indexer)

exp.explain_instance(ex_text_str, get_sample_predictions, num_samples=10)

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U69'), dtype('<U69')) -> dtype('<U69')

In [None]:
[1,2,3,4]*0

In [76]:
np.array([1,2,300])*0

array([0, 0, 0])