In [8]:
from __future__ import print_function
import math
import numpy as np
import json
import time
import matplotlib.pyplot as plt
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import defaultdict
from make_fn_data import load_fn_data
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from neural_net import Model, NpClassDataset

In [2]:
# Load and prepare data
data = load_fn_data()

statistics
# lex units:  13572
# frames:  1073
# data points:  200751
# lex units without data:  3271


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()
bert_model.to('cuda')

I0616 04:16:14.863266 29344 tokenization_utils.py:1075] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\danil/.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0616 04:16:15.103193 29344 configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\danil/.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0616 04:16:15.104233 29344 configuration_utils.py:301] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm

In [11]:
# create datapoints from data

frame_dict = {}
frame_dict_rev = {}

inputs = []
labels = []

for lu in data[:50]:
    frame =  lu["frame"]
    if not frame in frame_dict.keys():
        frame_dict[frame] = len(frame_dict.keys())
        frame_dict_rev[frame_dict[frame]] = frame
    frame_id = frame_dict[frame]
    
    for sentence in lu["sentences"]:
        text = sentence["text"]
        indexes = sentence["indexes"]        
        start = min([int(i[0]) for i in indexes])
        end = max([int(i[1]) for i in indexes])
        inputs.append((text, start, end))
        labels.append(frame_id)

print("# datapoints = ", len(labels))
print("max labels = ", max(labels))
print(len(frame_dict.keys()))

# datapoints =  807
max labels =  16
17


In [12]:
# You should build your custom dataset as below.
class FnBertDataset(torch.utils.data.Dataset):
    
    def __init__(self, inputs, labels, frame_dict, tokenizer, bert_model):
        """
        First two arguments should be lists with the format:
        inputs: [(text1, start1, end1), ...]
        labels: [label_id1, ...]
        """
        self.inputs = inputs
        self.labels = labels
        
        self.tokenizer = tokenizer
        self.bert_model = bert_model
        
        self.MAX_LEN = 3
        self.INPUT_DIM = self.MAX_LEN * self.bert_model.config.hidden_size
        self.OUTPUT_DIM = len(frame_dict.keys())
        
    def __getitem__(self, index):
        text, start, end = self.inputs[index]
        x = self.get_bert_hidden_state(text, start, end)
        y = torch.tensor(self.labels[index]).long()        
        return x, y
        
    def __len__(self):
        return len(self.labels)
    
    def get_bert_hidden_state(self, text, start, end):
        text = "[CLS] " + text + " [SEP]"
        start += len("[CLS] ")
        end += len("[CLS] ")
        
        # Compute start end end using token indexes
        tk_start, tk_end = self.pos_to_token_idx(text, start, end)
        tk_end = min(tk_start + self.MAX_LEN, tk_end)
        # Tokenize input
        tokenized_text = self.tokenizer.tokenize(text)
    
        # Convert token to vocabulary indices
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
        # Predict hidden states features for each layer
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor)
            # Hidden state of the last layer of the Bert model
            hidden = torch.squeeze(outputs[0], dim = 0)
            # Slice hidden state to hidden[start:end]
            hidden = hidden.narrow(0, tk_start, tk_end-tk_start)
            # Add padding
            pad = torch.zeros(self.MAX_LEN, hidden.size()[1])            
            pad[0:hidden.size()[0],:] = hidden
            hidden = torch.flatten(pad)
            return hidden

    def pos_to_token_idx(self, text, start, end):
        target_prefix = self.tokenizer.tokenize(text[:start])
        target = self.tokenizer.tokenize(text[start:end+1])
        tk_start = len(target_prefix)
        tk_end = tk_start + len(target)
        return tk_start, tk_end
    
# dataset = FnBertDataset([inputs[0], inputs[-1]], [labels[0], labels[-1]], frame_dict, tokenizer, bert_model)
dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)

start_time = time.time()
for i in range(10):
    dataset[i]
print("elapsed time (s) to generate one datapoint = ", time.time() - start_time)

print("dataset in = ", dataset[0][0])
print("dataset out = ", dataset[0][1], dataset[0][1].type())
print("dimensions: in =", dataset.INPUT_DIM, " out = ", dataset.OUTPUT_DIM)

elapsed time (s) to generate one datapoint =  1.2686066627502441
dataset in =  tensor([ 0.9563, -0.0470, -0.1990,  ...,  0.0000,  0.0000,  0.0000])
dataset out =  tensor(0) torch.LongTensor
dimensions: in = 2304  out =  17


In [7]:
def create_net(input_dim, output_dim):
    layers = [
        nn.Dropout(),
        nn.Linear(input_dim, 100),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(100, output_dim),    
    ]
    model = nn.Sequential(*layers)
    return model

# Run training & testing
net = create_net(input_dim = dataset.INPUT_DIM, output_dim = dataset.OUTPUT_DIM)
model = Model(net, criterion = nn.CrossEntropyLoss(),
              optimizer=optim.Adam(net.parameters(), lr=10e-4))
model.fit(dataset, n_epochs=10, batch_size=32, verbose=True, print_every=2)

[ 1,     2] loss: 1.539
[ 1,     4] loss: 1.182
Epoch  1 finished. Loss: 1.361 (elapsed 4.389s)
[ 2,     2] loss: 0.915
[ 2,     4] loss: 0.829
Epoch  2 finished. Loss: 0.872 (elapsed 4.700s)
[ 3,     2] loss: 0.582
[ 3,     4] loss: 0.334
Epoch  3 finished. Loss: 0.458 (elapsed 4.703s)
[ 4,     2] loss: 0.354
[ 4,     4] loss: 0.157
Epoch  4 finished. Loss: 0.256 (elapsed 4.486s)
[ 5,     2] loss: 0.184
[ 5,     4] loss: 0.072
Epoch  5 finished. Loss: 0.128 (elapsed 4.573s)
[ 6,     2] loss: 0.068
[ 6,     4] loss: 0.140
Epoch  6 finished. Loss: 0.104 (elapsed 5.466s)
[ 7,     2] loss: 0.079
[ 7,     4] loss: 0.027
Epoch  7 finished. Loss: 0.053 (elapsed 6.068s)
[ 8,     2] loss: 0.092
[ 8,     4] loss: 0.016
Epoch  8 finished. Loss: 0.054 (elapsed 5.767s)
[ 9,     2] loss: 0.025
[ 9,     4] loss: 0.042
Epoch  9 finished. Loss: 0.033 (elapsed 6.049s)
[10,     2] loss: 0.047
[10,     4] loss: 0.016
Epoch 10 finished. Loss: 0.032 (elapsed 5.841s)
Training finished (elapsed 52.041s)


In [None]:
torch.save(
    net.state_dict(), 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\state_dict_small')
torch.save(
    net, 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\net_small')

In [None]:
dev_dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)
# dev_dataset = FnBertDataset([inputs[0], inputs[-1]], [labels[0], labels[-1]], 
#                             frame_dict, tokenizer, bert_model)
model.net.eval()
model.test(dev_dataset)

In [None]:
print(frame_dict)