In [1]:
from __future__ import print_function
import math
import numpy as np
import json
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from make_fn_data import load_fn_data
from neural_net import Model, NpClassDataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM

I0614 00:57:35.926011  1796 file_utils.py:39] PyTorch version 1.2.0+cu92 available.
  from ._conv import register_converters as _register_converters


In [2]:
# Load and prepare data
data = load_fn_data()

statistics
# data points:  200751
# lex units without data:  3271


In [33]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()
bert_model.to('cuda')
print(bert_model.config)

I0615 01:01:55.699022  1796 tokenization_utils.py:1075] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\danil/.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0615 01:01:55.942373  1796 configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\danil/.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0615 01:01:55.943370  1796 configuration_utils.py:301] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [37]:
# create datapoints from data

frame_dict = {}
frame_dict_rev = {}

inputs = []
labels = []

for lu in data[:5000]:
    frame =  lu["frame"]
    if not frame in frame_dict.keys():
        frame_dict[frame] = len(frame_dict.keys())
        frame_dict_rev[frame_dict[frame]] = frame
    frame_id = frame_dict[frame]
    
    for sentence in lu["sentences"]:
        text = sentence["text"]
        indexes = sentence["indexes"]        
        start = min([int(i[0]) for i in indexes])
        end = max([int(i[1]) for i in indexes])
        inputs.append((text, start, end))
        labels.append(frame_id)
        
print("# datapoints = ", len(labels))
print("max labels = ", max(labels))
print(len(frame_dict.keys()))

# datapoints =  62445
max labels =  828
829


In [39]:
# You should build your custom dataset as below.
class FnBertDataset(torch.utils.data.Dataset):
    
    def __init__(self, inputs, labels, frame_dict, tokenizer, bert_model):
        """
        First two arguments should be lists with the format:
        inputs: [(text1, start1, end1), ...]
        labels: [label_id1, ...]
        """
        self.inputs = inputs
        self.labels = labels
        
        self.tokenizer = tokenizer
        self.bert_model = bert_model
        
        self.MAX_LEN = 4
        self.INPUT_DIM = self.MAX_LEN * self.bert_model.config.hidden_size
        self.OUTPUT_DIM = len(frame_dict.keys())
        
    def __getitem__(self, index):
        text, start, end = self.inputs[index]
        x = self.get_bert_hidden_state(text, start, end)
        y = torch.tensor(self.labels[index]).long()        
        return x, y
        
    def __len__(self):
        return len(self.labels)
    
    def get_bert_hidden_state(self, text, start, end):
        text = "[CLS] " + text + " [SEP]"
        start += len("[CLS] ")
        end += len("[CLS] ")
        
        # Compute start end end using token indexes
        tk_start, tk_end = self.pos_to_token_idx(text, start, end)
        tk_end = min(tk_start + self.MAX_LEN, tk_end)
        # Tokenize input
        tokenized_text = self.tokenizer.tokenize(text)
    
        # Convert token to vocabulary indices
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
        # Predict hidden states features for each layer
        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor)
            # Hidden state of the last layer of the Bert model
            hidden = torch.squeeze(outputs[0], dim = 0)
            # Slice hidden state to hidden[start:end]
            hidden = hidden.narrow(0, tk_start, tk_end-tk_start)
            # Add padding
            pad = torch.zeros(self.MAX_LEN, hidden.size()[1])            
            pad[0:hidden.size()[0],:] = hidden
            hidden = torch.flatten(pad)
            return hidden

    def pos_to_token_idx(self, text, start, end):
        target_prefix = self.tokenizer.tokenize(text[:start])
        target = self.tokenizer.tokenize(text[start:end+1])
        tk_start = len(target_prefix)
        tk_end = tk_start + len(target)
        return tk_start, tk_end
    
dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)
print("dataset in = ", dataset[100][0])
print("dataset out = ", dataset[100][1], dataset[100][1].type())
print("dimensions: in =", dataset.INPUT_DIM, " out = ", dataset.OUTPUT_DIM)

dataset in =  tensor([-0.2417, -0.4016,  0.5359,  ...,  0.0000,  0.0000,  0.0000])
dataset out =  tensor(4) torch.LongTensor
dimensions: in = 3072  out =  829


In [41]:
def create_net(input_dim, output_dim):
    layers = [
        nn.Linear(input_dim, 200),
        nn.ReLU(),
        nn.Linear(200, output_dim),
    ]
    model = nn.Sequential(*layers)
    return model

# Run training & testing
net = create_net(input_dim = dataset.INPUT_DIM, output_dim = dataset.OUTPUT_DIM)
model = Model(net, criterion = nn.CrossEntropyLoss(),
              optimizer=optim.Adam(net.parameters(), lr=10e-4))
model.fit(dataset, n_epochs=10, batch_size=32, verbose=True)

[ 1,    20] loss: 5.960
[ 1,    40] loss: 4.943
[ 1,    60] loss: 4.548
[ 1,    80] loss: 4.096
[ 1,   100] loss: 3.701
[ 1,   120] loss: 3.250
[ 1,   140] loss: 2.935
[ 1,   160] loss: 2.655
[ 1,   180] loss: 2.268
[ 1,   200] loss: 2.133
[ 1,   220] loss: 1.843
[ 1,   240] loss: 1.662
[ 1,   260] loss: 1.471
[ 1,   280] loss: 1.386
[ 1,   300] loss: 1.276
[ 1,   320] loss: 1.134
[ 1,   340] loss: 1.065
[ 1,   360] loss: 1.022
[ 1,   380] loss: 1.028
[ 1,   400] loss: 0.909
[ 1,   420] loss: 0.987
[ 1,   440] loss: 0.816
[ 1,   460] loss: 0.798
[ 1,   480] loss: 0.765
[ 1,   500] loss: 0.706
[ 1,   520] loss: 0.756
[ 1,   540] loss: 0.664
[ 1,   560] loss: 0.635
[ 1,   580] loss: 0.643
[ 1,   600] loss: 0.582
[ 1,   620] loss: 0.615
[ 2,    20] loss: 0.475
[ 2,    40] loss: 0.411
[ 2,    60] loss: 0.427
[ 2,    80] loss: 0.379
[ 2,   100] loss: 0.364
[ 2,   120] loss: 0.365
[ 2,   140] loss: 0.405
[ 2,   160] loss: 0.427
[ 2,   180] loss: 0.359
[ 2,   200] loss: 0.360
[ 2,   220] loss

In [42]:
torch.save(net.state_dict(), 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\state_dict')
torch.save(net, 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\net')

In [43]:
dev_dataset = FnBertDataset(inputs[500:1000], labels[500:1000], frame_dict, tokenizer, bert_model)
print(len(dev_dataset))
model.test(dev_dataset)

500
Accuracy on the 500 data points: 97 %


In [47]:
print(frame_dict.keys())
print(frame_dict.keys())
print(inputs[0], frame_dict_rev[labels[0]])

dict_keys(['Duplication', 'Body_movement', 'Topic', 'Reliance', 'Revenge', 'Reliance_on_expectation', 'Recovery', 'Emptying', 'Law', 'Being_located', 'Practice', 'Speed_description', 'Taking_time', 'Shaped_part', 'Architectural_part', 'Filling', 'Age', 'People_by_age', 'People_by_origin', 'Losing_it', 'Grasp', 'Craft', 'Building_subparts', 'Change_event_time', 'Attempt_suasion', 'Perception_body', 'Cause_change_of_position_on_a_scale', 'Conduct', 'Type', 'Body_mark', 'Perception_experience', 'Departing', 'Desirability', 'Gizmo', 'Contingency', 'Idiosyncrasy', 'Moving_in_place', 'Cause_to_make_noise', 'Education_teaching', 'Communication', 'Excreting', 'Capability', 'Motion_noise', 'Rest', 'Increment', 'Political_locales', 'Emotion_active', 'Change_of_leadership', 'Text', 'Natural_features', 'Attempt', 'Perception_active', 'Sensation', 'Partitive', 'Evoking', 'Locale_by_use', 'Aesthetics', 'Noise_makers', 'Request', 'Rite', 'Religious_belief', 'People_by_religion', 'Social_event', 'Buil

In [84]:
def predict_top_k_dataset(dataset, k, batch_size=1):
        predicted_lst = []
        probs_lst = []
        data_loader = torch.utils.data.DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False)    
        with torch.no_grad():
            for (inputs, _) in data_loader:
                inputs = inputs.to("cuda")
                predicted, probs = predict_top_k(inputs, k)
                predicted_lst.append(predicted)
                probs_lst.append(probs)
        predicted_tensor = torch.cat(predicted_lst, 0)
        probs_tensor = torch.cat(probs_lst, 0)
        return predicted_tensor, probs_tensor
    
def predict_top_k(inputs, k, batch_size=1):
    inputs = inputs.to("cuda")
    with torch.no_grad():
        outputs = net(inputs)
        logits, predicted = torch.topk(outputs.data, k, dim = 1)
        softmax = nn.Softmax(dim=1)
        probs = softmax(logits)
        return predicted, probs

In [85]:
dev_in = [
    ("the problem is telling which is the original document and which the copy", 68, 71),
    ("the cause of the accident is not clear", 4, 8),
    ("Rubella, also known as German measles or three-day measles, is an infection caused by the rubella virus.", 0, 6),
    ("he died after a long illness", 21, 27),
    ("for a time revolution was a strong probability", 35, 45),
]
dev_lab = [
    frame_dict["Duplication"], frame_dict["Causation"], 
    frame_dict["Medical_conditions"], frame_dict["Medical_conditions"],
    frame_dict["Probability"]
]
dev_dataset = FnBertDataset(dev_in, dev_lab, frame_dict, tokenizer, bert_model)
preds, probs = predict_top_k_dataset(dev_dataset, 5)
preds = preds.tolist()
probs = probs.tolist()
for pred, prob in zip(preds, probs):
    print([(frame_dict_rev[x], round(y, 2)) for x, y in zip(pred, prob)])

[('Duplication', 0.68), ('Correctness', 0.16), ('Spelling_and_pronouncing', 0.1), ('Forging', 0.04), ('Location_in_time', 0.02)]
[('Causation', 0.59), ('Explaining_the_facts', 0.34), ('Location_in_time', 0.06), ('Destiny', 0.01), ('Purpose', 0.01)]
[('Weapon', 0.26), ('Conduct', 0.21), ('People_by_age', 0.19), ('Misdeed', 0.18), ('Measure_linear_extent', 0.15)]
[('Medical_conditions', 0.53), ('Posture', 0.14), ('Individual_history', 0.13), ('Locale_by_use', 0.13), ('Timespan', 0.07)]
[('Opinion', 0.58), ('Probability', 0.23), ('Awareness', 0.08), ('Military', 0.06), ('Mental_property', 0.04)]
