In [1]:
from __future__ import print_function
import math
import numpy as np
import json
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import defaultdict
from make_fn_data import load_fn_data
from transformers import BertTokenizer, BertModel, BertForMaskedLM

I0616 01:47:04.358864 19356 file_utils.py:39] PyTorch version 1.2.0+cu92 available.
  from ._conv import register_converters as _register_converters


In [2]:
# Load the data set through the project-local helper `load_fn_data`.
# Later cells read each element as a dict with "frame" and "sentences" keys
# (presumably FrameNet lexical units -- confirm against make_fn_data).
data = load_fn_data()

statistics
# data points:  200751
# lex units without data:  3271


In [3]:
# Count the distinct frame names that occur in the loaded data.
frame_names = {entry["frame"] for entry in data}
print("# frames: ", len(frame_names))

# frames:  1073


In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
# Put the encoder in evaluation mode; it is only used for feature extraction below.
bert_model.eval()

In [None]:
# Build (text, start, end) inputs and integer frame labels from the data.

frame_dict = {}      # frame name -> integer label id
frame_dict_rev = {}  # integer label id -> frame name

inputs = []
labels = []

# Only the first 10 entries of `data` are used here.
for lu in data[:10]:
    frame = lu["frame"]
    # Hand out the next free id the first time a frame name is seen.
    if frame not in frame_dict:
        new_id = len(frame_dict)
        frame_dict[frame] = new_id
        frame_dict_rev[new_id] = frame
    frame_id = frame_dict[frame]

    for sentence in lu["sentences"]:
        text = sentence["text"]
        indexes = sentence["indexes"]
        # The target span runs from the smallest start offset to the largest end offset.
        start = min(int(span[0]) for span in indexes)
        end = max(int(span[1]) for span in indexes)
        inputs.append((text, start, end))
        labels.append(frame_id)

print("# datapoints = ", len(labels))
print("max labels = ", max(labels))
print(len(frame_dict))

In [None]:
# You should build your custom dataset as below.
class NpClassDataset(torch.utils.data.Dataset):
    """Dataset that serves (features, label) tensor pairs out of numpy arrays."""

    def __init__(self, inputs, labels):
        """
        Keep references to the arrays backing the dataset.

        Both arguments should be numpy arrays:
        inputs: shape (N, F)
        labels: shape (N, 1)
        with N the number of data points and F the number of features.
        """
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Number of data points, read from the first axis of the labels."""
        n_points = self.labels.shape[0]
        return n_points

    def __getitem__(self, index):
        """Return item `index` as a float32 feature tensor and a long label tensor."""
        features = self.inputs[index].astype(np.float32)
        # Squeeze away the singleton label axis before converting.
        label = np.squeeze(self.labels[index]).astype(np.longlong)
        return torch.from_numpy(features), torch.from_numpy(label)

class Model():
    """
    Training / evaluation wrapper around a classification network.

    The wrapped network is expected to map a batch of inputs to a
    (batch, n_classes) tensor of unnormalized scores; softmax is applied
    over dimension 1 to turn those scores into probabilities.
    """

    def __init__(self, net, criterion, optimizer):
        """
        net: the torch module to train / evaluate (moved to the chosen device)
        criterion: loss called as criterion(outputs, labels)
        optimizer: optimizer already bound to net's parameters
        """
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.net = net
        self.net.to(self.device)
        self.criterion = criterion
        self.optimizer = optimizer
        # used to compute probabilities
        self.softmax = nn.Softmax(dim=1)

    def fit(self, dataset, n_epochs=1, batch_size=10, verbose=True, print_every=20):
        """
        Train the network on `dataset` for `n_epochs` passes.

        Batches are shuffled. When `verbose` is true, the mean loss of the
        last `print_every` mini-batches is printed every `print_every` batches.
        """
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=True)

        # predict() switches the net to eval mode, so re-enable training mode here.
        self.net.train()

        # loop over the dataset multiple times
        for epoch in range(n_epochs):
            running_loss = 0.0
            for i, (inputs, labels) in enumerate(data_loader, 0):
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                # zero the parameter gradients
                self.optimizer.zero_grad()

                # forward + backward + optimize
                outputs = self.net(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()
                # prints every "print_every" mini-batches
                if verbose and i % print_every == (print_every - 1):
                    print('[%2d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / print_every))
                    running_loss = 0.0

        print("Training finished")

    def test(self, dataset, batch_size=10):
        """
        Print per-label accuracy and overall accuracy of the net on `dataset`.

        Returns None. Prints a notice instead of dividing by zero when the
        dataset yields no data points.
        """
        label_correct = defaultdict(int)
        label_total = defaultdict(int)

        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad():
            for (inputs, labels) in data_loader:
                # predict() returns (classes, probabilities); only classes are needed.
                predicted, _ = self.predict(inputs)

                for (label, pred) in zip(labels.tolist(), predicted.tolist()):
                    label_correct[label] += 1 if label == pred else 0
                    label_total[label] += 1

        total = sum(label_total.values())
        correct = sum(label_correct.values())
        if total == 0:
            print("No data points to evaluate")
            return

        for label in label_correct.keys():
            print("Label %s accuracy %d\n" %
                  (label, 100 * label_correct[label] / label_total[label]))

        # TODO: only using accuracy, allow for other metrics
        print('Total accuracy on %d data points: %d %%' % (
            len(dataset), 100 * correct / total))

    def predict_dataset(self, dataset, batch_size=10):
        """
        Predict every item of `dataset` in order.

        Returns (predicted, probs): 1-D tensors holding, per data point, the
        predicted class index and the probability of that class. Both are
        empty when the dataset yields no batches.
        """
        predicted_lst = []
        probs_lst = []
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False)
        with torch.no_grad():
            for (inputs, _) in data_loader:
                predicted, probs = self.predict(inputs)
                predicted_lst.append(predicted)
                probs_lst.append(probs)
        if not predicted_lst:
            return torch.empty(0, dtype=torch.long), torch.empty(0)
        predicted_tensor = torch.cat(predicted_lst, 0)
        probs_tensor = torch.cat(probs_lst, 0)
        return predicted_tensor, probs_tensor

    def predict(self, inputs):
        """
        Predict the most likely class for each row of `inputs`.

        Returns (predicted, probs): the argmax class index per row and the
        softmax probability assigned to that class, each of shape (batch,).
        """
        inputs = inputs.to(self.device)
        self.net.eval()
        with torch.no_grad():
            outputs = self.net(inputs)
            probabilities = self.softmax(outputs)
            _, predicted = torch.max(outputs, 1)
            # Pick, for every row, the probability of that row's predicted class.
            predicted_probs = probabilities.gather(
                1, predicted.unsqueeze(1)).squeeze(1)
            return predicted, predicted_probs

    def predict_top_k_dataset(self, dataset, k, batch_size=1):
        """
        Run predict_top_k over every item of `dataset` in order.

        Returns (predicted, probs), each of shape (n_items, k); both are empty
        when the dataset yields no batches.
        """
        predicted_lst = []
        probs_lst = []
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False)
        with torch.no_grad():
            for (inputs, _) in data_loader:
                predicted, probs = self.predict_top_k(inputs, k)
                predicted_lst.append(predicted)
                probs_lst.append(probs)
        if not predicted_lst:
            return torch.empty(0, k, dtype=torch.long), torch.empty(0, k)
        predicted_tensor = torch.cat(predicted_lst, 0)
        probs_tensor = torch.cat(probs_lst, 0)
        return predicted_tensor, probs_tensor

    def predict_top_k(self, inputs, k):
        """
        Return the `k` highest-scoring classes for each row of `inputs`.

        Returns (predicted, probs), each of shape (batch, k). `probs` is a
        softmax over the k selected scores only, so each row sums to 1 across
        the returned classes rather than across all classes.
        """
        inputs = inputs.to(self.device)
        self.net.eval()
        with torch.no_grad():
            outputs = self.net(inputs)
            logits, predicted = torch.topk(outputs, k, dim=1)
            probs = self.softmax(logits)
            return predicted, probs

In [None]:
# You should build your custom dataset as below.
class FnBertDataset(torch.utils.data.Dataset):
    """
    Dataset that turns (text, start, end) target spans into BERT features.

    Each item is the last-layer BERT hidden state of the first `max_len`
    tokens of the target span, zero-padded to `max_len` tokens and flattened,
    paired with the integer label of the span.
    """

    def __init__(self, inputs, labels, frame_dict, tokenizer, bert_model, max_len=3):
        """
        First two arguments should be lists with the format:
        inputs: [(text1, start1, end1), ...]
        labels: [label_id1, ...]

        frame_dict: mapping whose size gives the number of output classes
        tokenizer: object providing tokenize() and convert_tokens_to_ids()
        bert_model: encoder whose first output is the last-layer hidden state
        max_len: number of target tokens kept per item (default 3)
        """
        self.inputs = inputs
        self.labels = labels

        self.tokenizer = tokenizer
        self.bert_model = bert_model

        self.MAX_LEN = max_len
        self.INPUT_DIM = self.MAX_LEN * self.bert_model.config.hidden_size
        self.OUTPUT_DIM = len(frame_dict)

    def __getitem__(self, index):
        """Return (features, label) for item `index`; label is a long tensor."""
        text, start, end = self.inputs[index]
        x = self.get_bert_hidden_state(text, start, end)
        y = torch.tensor(self.labels[index]).long()
        return x, y

    def __len__(self):
        """Number of data points."""
        return len(self.labels)

    def _model_device(self):
        """Device the BERT model lives on (CPU when it has no parameters)."""
        first_param = next(self.bert_model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def get_bert_hidden_state(self, text, start, end):
        """
        Encode `text` with BERT and return the flattened, zero-padded hidden
        states of the target span as a CPU tensor of length INPUT_DIM.

        `start` and `end` are character offsets into `text`; `end` is inclusive.
        """
        cls_prefix = "[CLS] "
        text = cls_prefix + text + " [SEP]"
        # Shift the character offsets to account for the prepended marker.
        start += len(cls_prefix)
        end += len(cls_prefix)

        # Compute start and end using token indexes, keeping at most MAX_LEN tokens
        tk_start, tk_end = self.pos_to_token_idx(text, start, end)
        tk_end = min(tk_start + self.MAX_LEN, tk_end)
        # Tokenize input
        tokenized_text = self.tokenizer.tokenize(text)

        # Convert token to vocabulary indices
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Put the ids on whatever device the model is on, instead of assuming CUDA.
        tokens_tensor = torch.tensor([indexed_tokens]).to(self._model_device())
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor)
            # Hidden state of the last layer of the Bert model
            hidden = torch.squeeze(outputs[0], dim=0)
            # Slicing clamps to the sequence length, so a span that runs past
            # the tokenized text yields fewer rows instead of raising.
            hidden = hidden[tk_start:tk_end]
            # Add padding (the padded result is allocated on the CPU)
            pad = torch.zeros(self.MAX_LEN, hidden.size()[1])
            pad[0:hidden.size()[0], :] = hidden
            return torch.flatten(pad)

    def pos_to_token_idx(self, text, start, end):
        """
        Convert character offsets into token indexes.

        The prefix text[:start] and the target text[start:end+1] are tokenized
        separately; returns (tk_start, tk_end) with tk_end exclusive.
        """
        target_prefix = self.tokenizer.tokenize(text[:start])
        target = self.tokenizer.tokenize(text[start:end+1])
        tk_start = len(target_prefix)
        tk_end = tk_start + len(target)
        return tk_start, tk_end
    
# Run BERT on the GPU when one is available; otherwise keep it on the CPU.
bert_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(bert_device)
# Two-item variant kept for quick debugging:
# dataset = FnBertDataset([inputs[0], inputs[-1]], [labels[0], labels[-1]], frame_dict, tokenizer, bert_model)
dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)

import time
# Time feature extraction for (up to) the first 10 items.
start_time = time.time()
for i in range(min(10, len(dataset))):
    dataset[i]
print("elapsed time seconds = ", time.time() - start_time)

# Fetch the first item once instead of re-running BERT for every print.
sample_in, sample_out = dataset[0]
print("dataset in = ", sample_in)
print("dataset out = ", sample_out, sample_out.type())
print("dimensions: in =", dataset.INPUT_DIM, " out = ", dataset.OUTPUT_DIM)

In [None]:
def create_net(input_dim, output_dim, hidden_dim=100):
    """
    Build a two-layer feed-forward classifier.

    input_dim: size of each input feature vector
    output_dim: number of output scores (one per class)
    hidden_dim: width of the single hidden layer (default 100)

    Returns an nn.Sequential of Linear -> ReLU -> Linear; the outputs are
    unnormalized scores (no softmax is applied here).
    """
    return nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, output_dim),
    )

# Build the classifier, pair it with its loss and optimizer, then train it.
net = create_net(input_dim=dataset.INPUT_DIM, output_dim=dataset.OUTPUT_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=10e-4)  # note: 10e-4 is 1e-3
model = Model(net, criterion=criterion, optimizer=optimizer)
model.fit(dataset, n_epochs=5, batch_size=10, verbose=True, print_every=5)

In [None]:
# Persist the trained classifier twice: first only its state dict (the weights),
# then the whole module object.
# NOTE(review): both targets are absolute paths on one specific Windows machine;
# this cell fails elsewhere unless that directory exists -- consider a
# configurable output directory.
torch.save(
    net.state_dict(), 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\state_dict_small')
torch.save(
    net, 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\net_small')

In [None]:
# Evaluate the trained model: prints the dataset size, then per-label and
# overall accuracy.
# NOTE(review): this "dev" dataset is built from the same `inputs` / `labels`
# that were used for training, so the numbers reported here are training
# accuracy, not held-out accuracy.
dev_dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)
# Two-item variant kept for quick debugging:
# dev_dataset = FnBertDataset([inputs[0], inputs[-1]], [labels[0], labels[-1]], 
#                             frame_dict, tokenizer, bert_model)
print(len(dev_dataset))
model.test(dev_dataset)

In [None]:
# Show the frame-name -> label-id mapping built in the datapoint-creation cell.
print(frame_dict)