In [1]:
from __future__ import print_function
import os
import math
import numpy as np
import json
import time
import random
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from make_fn_data import load_fn_data
from neural_net import Model, NpClassDataset
from transformers import BertTokenizer, BertModel, BertForMaskedLM

I0617 14:45:57.437368 26044 file_utils.py:39] PyTorch version 1.2.0+cu92 available.
  from ._conv import register_converters as _register_converters


In [None]:
# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the checkpoint name is written once and reused for both.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# BERT is only used as a frozen feature extractor here: evaluation mode
# disables dropout so repeated calls give the same hidden states.
bert_model.eval()
# bert_model.to('cuda')  # left disabled: the notebook runs the encoder on CPU
print(bert_model.config)

In [2]:
# Load and prepare data
# load_fn_data is imported from the project-local make_fn_data module.
# The returned collection is iterated in the next cell, where every element is
# read through its "frame" and "sentences" keys (presumably one record per
# lexical unit -- confirm against make_fn_data).
data = load_fn_data()

statistics
# lex units:  13572
# frames:  1073
# data points:  200751
# lex units without data:  3271


In [3]:
# Turn the loaded records into parallel lists of model inputs and labels.

frame_dict = {}      # frame name -> integer id (assigned in order of first appearance)
frame_dict_rev = {}  # integer id -> frame name (inverse of frame_dict)

inputs = []  # (sentence text, first target char, last target char)
labels = []  # frame id for the input at the same position

for lex_unit in data:
    frame_name = lex_unit["frame"]
    if frame_name not in frame_dict:
        # A frame gets an id the first time it is seen, even if the record
        # turns out to contribute no sentences.
        new_id = len(frame_dict)
        frame_dict[frame_name] = new_id
        frame_dict_rev[new_id] = frame_name
    frame_id = frame_dict[frame_name]

    for sent in lex_unit["sentences"]:
        sent_text = sent["text"]
        spans = sent["indexes"]
        if len(spans) == 0:
            # No annotated target span: nothing to classify in this sentence.
            continue
        # Collapse all annotated spans into one covering span.
        span_start = min(int(span[0]) for span in spans)
        span_end = max(int(span[1]) for span in spans)
        inputs.append((sent_text, span_start, span_end))
        labels.append(frame_id)

print("# datapoints = ", len(labels))
print("max labels = ", max(labels))
print(len(frame_dict.keys()))

# datapoints =  200750
max labels =  1072
1073


In [7]:
import pprint

# Dump the complete frame-name -> id mapping for manual inspection.
pp = pprint.PrettyPrinter(depth=6, indent=0)
pp.pprint(frame_dict)

{'Abandonment': 664,
'Abounding_with': 557,
'Absorb_heat': 338,
'Abundance': 452,
'Abusing': 544,
'Accompaniment': 915,
'Accomplishment': 402,
'Accoutrements': 387,
'Accuracy': 609,
'Achieving_first': 853,
'Active_substance': 292,
'Activity_done_state': 1027,
'Activity_finish': 268,
'Activity_ongoing': 269,
'Activity_pause': 270,
'Activity_prepare': 271,
'Activity_ready_state': 399,
'Activity_resume': 153,
'Activity_start': 487,
'Activity_stop': 261,
'Actually_occurring_entity': 437,
'Addiction': 556,
'Adding_up': 1058,
'Adducing': 451,
'Adjacency': 960,
'Adjusting': 326,
'Adopt_selection': 440,
'Aesthetics': 56,
'Affirm_or_deny': 975,
'Age': 16,
'Aggregate': 152,
'Aging': 619,
'Agree_or_refuse_to_act': 477,
'Agriculture': 656,
'Aiming': 607,
'Alliance': 433,
'Alternatives': 632,
'Amalgamation': 340,
'Amassing': 382,
'Ambient_temperature': 87,
'Ammunition': 424,
'Amounting_to': 206,
'Animals': 951,
'Annoyance': 674,
'Appeal': 1052,
'Appellations': 772,
'Apply_heat': 341,
'Appointing': 

'Go_into_shape': 735,
'Goal': 165,
'Going_back_on_a_commitment': 426,
'Gradable_artistic_quality': 906,
'Gradable_proximity': 961,
'Graph_shape': 796,
'Grasp': 20,
'Grinding': 984,
'Grooming': 253,
'Ground_up': 554,
'Growing_food': 657,
'Guest_and_host': 634,
'Guilt_or_innocence': 113,
'Gusto': 810,
'Hair_configuration': 237,
'Halt': 67,
'Have_as_requirement': 385,
'Have_as_translation_equivalent': 498,
'Have_associated': 633,
'Have_visitor_over': 635,
'Having_or_lacking_access': 584,
'Health_response': 1068,
'Hearsay': 1025,
'Heat_potential': 761,
'Hedging': 971,
'Heralding': 88,
'Hiding_objects': 510,
'Hindering': 186,
'Hiring': 118,
'Historic_event': 613,
'History': 698,
'Hit_or_miss': 608,
'Hit_target': 1050,
'Holding_off_on': 435,
'Hospitality': 638,
'Hostile_encounter': 105,
'Hunting': 653,
'Hunting_success_or_failure': 655,
'Identicality': 74,
'Identity': 990,
'Idiosyncrasy': 35,
'Imitating': 509,
'Immobilization': 734,
'Impact': 195,
'Import_export_scenario': 1048,
'Importance'

In [None]:
# You should build your custom dataset as below.
class FnBertDataset(torch.utils.data.Dataset):
    """Dataset that turns (sentence, target span) pairs into BERT features.

    Each item is computed on access: the sentence is run through
    ``bert_model`` and the last-layer hidden states of the tokens covering the
    target span are cut out, zero-padded / truncated to ``MAX_LEN`` tokens and
    flattened into one vector.

    Attributes:
        MAX_LEN: number of target tokens kept per example.
        INPUT_DIM: length of the feature vector
            (``MAX_LEN * bert_model.config.hidden_size``).
        OUTPUT_DIM: number of classes (``len(frame_dict)``).
    """

    def __init__(self, inputs, labels, frame_dict, tokenizer, bert_model, max_len=4):
        """
        First two arguments should be lists with the format:
        inputs: [(text1, start1, end1), ...]
        labels: [label_id1, ...]

        ``start``/``end`` are character offsets into ``text``; ``end`` is
        inclusive. ``frame_dict`` is only used for its size. ``max_len`` is
        the number of target tokens kept per example (default 4, the value
        that used to be hard-coded).

        Raises:
            ValueError: if ``max_len`` is smaller than 1 or if ``inputs`` and
                ``labels`` differ in length.
        """
        if max_len < 1:
            raise ValueError("max_len must be at least 1, got {}".format(max_len))
        if len(inputs) != len(labels):
            # __len__ is driven by labels alone, so a mismatch would otherwise
            # surface as an IndexError (or silently ignored inputs) much later.
            raise ValueError(
                "inputs and labels must have the same length ({} != {})".format(
                    len(inputs), len(labels)))

        self.inputs = inputs
        self.labels = labels

        self.tokenizer = tokenizer
        self.bert_model = bert_model

        self.MAX_LEN = max_len
        self.INPUT_DIM = self.MAX_LEN * self.bert_model.config.hidden_size
        self.OUTPUT_DIM = len(frame_dict.keys())

    def __getitem__(self, index):
        """Return ``(features, label)`` for one example.

        ``features`` is a flat float tensor of length ``INPUT_DIM``; ``label``
        is a 0-dim long tensor. Every call runs a BERT forward pass.
        """
        text, start, end = self.inputs[index]
        x = self.get_bert_hidden_state(text, start, end)
        y = torch.tensor(self.labels[index], dtype=torch.long)
        return x, y

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def _bert_device(self):
        """Device holding the encoder's weights (CPU if it has no parameters)."""
        first_param = next(self.bert_model.parameters(), None)
        if first_param is None:
            return torch.device("cpu")
        return first_param.device

    def get_bert_hidden_state(self, text, start, end):
        """Encode ``text`` and return the flattened hidden states of the target.

        Args:
            text: raw sentence, without special tokens.
            start: character offset of the first target character.
            end: character offset of the last target character (inclusive).

        Returns:
            A CPU float tensor of length ``INPUT_DIM``: up to ``MAX_LEN``
            target-token vectors followed by zero padding.
        """
        cls_prefix = "[CLS] "
        text = cls_prefix + text + " [SEP]"
        # The offsets were given relative to the raw sentence.
        start += len(cls_prefix)
        end += len(cls_prefix)

        # Translate character offsets into token positions and keep at most
        # MAX_LEN tokens of the target.
        tk_start, tk_end = self.pos_to_token_idx(text, start, end)
        tk_end = min(tk_start + self.MAX_LEN, tk_end)

        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Build the batch-of-one input on the encoder's own device so the
        # dataset keeps working when bert_model has been moved off the CPU.
        tokens_tensor = torch.tensor([indexed_tokens], device=self._bert_device())

        with torch.no_grad():
            outputs = self.bert_model(tokens_tensor)
            # Hidden state of the last layer of the Bert model, batch dim removed
            hidden = torch.squeeze(outputs[0], dim=0)
            # Keep only the rows belonging to the target tokens
            hidden = hidden.narrow(0, tk_start, tk_end - tk_start)
            # Zero-pad to a fixed MAX_LEN rows; the buffer lives on the CPU, so
            # the returned features are on the CPU regardless of the encoder.
            padded = torch.zeros(self.MAX_LEN, hidden.size(1))
            padded[0:hidden.size(0), :] = hidden
            return torch.flatten(padded)

    def pos_to_token_idx(self, text, start, end):
        """Map an inclusive character span to a half-open token span.

        The prefix ``text[:start]`` and the target ``text[start:end + 1]`` are
        tokenized separately and their token counts give the positions. This
        assumes the target starts on a token boundary of the full text.

        Returns:
            ``(tk_start, tk_end)`` with ``tk_end`` exclusive.
        """
        target_prefix = self.tokenizer.tokenize(text[:start])
        target = self.tokenizer.tokenize(text[start:end + 1])
        tk_start = len(target_prefix)
        tk_end = tk_start + len(target)
        return tk_start, tk_end
    
dataset = FnBertDataset(inputs, labels, frame_dict, tokenizer, bert_model)

# Every dataset[i] access runs a full BERT forward pass, so fetch the sample
# once instead of indexing the dataset separately for each printed value.
sample_x, sample_y = dataset[100]
print("dataset in = ", sample_x)
print("dataset out = ", sample_y, sample_y.type())
print("dimensions: in =", dataset.INPUT_DIM, " out = ", dataset.OUTPUT_DIM)

In [None]:
def create_net(input_dim, output_dim, hidden_dim=400):
    """Build the feed-forward classifier placed on top of the BERT features.

    Architecture: Dropout -> Linear(input_dim, hidden_dim) -> ReLU ->
    Dropout -> Linear(hidden_dim, output_dim). The last layer returns raw
    logits (no softmax).

    Args:
        input_dim: length of each input feature vector.
        output_dim: number of classes.
        hidden_dim: width of the hidden layer. Defaults to 400; checkpoints
            saved with a given width can only be loaded into a net built with
            the same width.

    Returns:
        An ``nn.Sequential``. The layer order is part of the contract: the
        state-dict keys ("1.weight", "4.weight", ...) are derived from the
        layer positions.
    """
    return nn.Sequential(
        nn.Dropout(),
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Dropout(),
        nn.Linear(hidden_dim, output_dim),
    )

# Build the classifier and restore previously trained weights
net = create_net(input_dim = dataset.INPUT_DIM, output_dim = dataset.OUTPUT_DIM)

net = net.cpu()

# NOTE(review): machine-specific absolute path; point this at your own copy of
# the checkpoint before running on another machine.
STATE_DICT_PATH = 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\state_dict_3'

# map_location='cpu' lets the checkpoint be read even if its tensors were
# saved from a GPU and no GPU is available now; the net itself is on the CPU.
net.load_state_dict(torch.load(STATE_DICT_PATH, map_location='cpu'))

In [None]:
# Wrap the network in the project's Model helper.
loss_fn = nn.CrossEntropyLoss()
# 1e-4 is the same value the notebook previously spelled as 10e-5.
adam = optim.Adam(net.parameters(), lr=1e-4)
model = Model(net, criterion=loss_fn, optimizer=adam)

In [None]:
# model.fit(dataset, n_epochs=10, batch_size=32, verbose=True)

In [None]:
# torch.save(
#     net.state_dict(), 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\state_dict_5')
# torch.save(
#     net, 'C:\\Users\\danil\\Documents\\Northwestern\\Research\\projects\\frame_classification\\net_5')

In [None]:
# Spot-check on up to 1000 distinct examples.
# random.sample draws without replacement, so no example is counted twice
# (random.choices could return the same index several times).
# NOTE: the examples come from the same `inputs` list the full dataset is
# built from, so this is a sanity check rather than a held-out score.
dev_size = min(1000, len(inputs))
dev_idxs = random.sample(range(len(inputs)), k=dev_size)
dev_inputs = [inputs[idx] for idx in dev_idxs]
dev_labels = [labels[idx] for idx in dev_idxs]

# Evaluation mode: turns off the dropout layers of the classifier.
net.eval()
dev_dataset = FnBertDataset(dev_inputs, dev_labels, frame_dict, tokenizer, bert_model)
print("length of dev set: ", len(dev_dataset))
model.test(dev_dataset)

In [None]:
def predict_top_k_dataset(dataset, k, batch_size=1, model=None):
    """Predict the k most probable classes for every example of a dataset.

    Args:
        dataset: map-style dataset yielding ``(features, label)`` pairs; the
            labels are ignored.
        k: number of classes to return per example.
        batch_size: number of examples per forward pass.
        model: classifier to use; defaults to the notebook-level ``net``.

    Returns:
        ``(predicted, probs)``: two tensors of shape ``(len(dataset), k)``
        holding class ids and their probabilities, best class first. Both are
        empty (0 rows) for an empty dataset.
    """
    predicted_lst = []
    probs_lst = []
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=batch_size, shuffle=False)
    for batch_inputs, _ in data_loader:
        # Device placement and no_grad are handled by predict_top_k.
        predicted, probs = predict_top_k(batch_inputs, k, model=model)
        predicted_lst.append(predicted)
        probs_lst.append(probs)

    if not predicted_lst:
        # torch.cat rejects an empty list; return well-shaped empty results.
        return torch.empty((0, k), dtype=torch.long), torch.empty((0, k))
    return torch.cat(predicted_lst, 0), torch.cat(probs_lst, 0)


def predict_top_k(inputs, k, batch_size=1, model=None):
    """Predict the k most probable classes for a batch of feature vectors.

    Args:
        inputs: float tensor of shape ``(batch, input_dim)``.
        k: number of classes to return per example.
        batch_size: unused; kept so existing calls keep working.
        model: classifier to use; defaults to the notebook-level ``net``.
            The caller is responsible for putting it in eval mode.

    Returns:
        ``(predicted, probs)``: class ids and probabilities, each of shape
        ``(batch, k)``, sorted by decreasing probability. The probabilities
        are taken from a softmax over ALL classes, so a row sums to less than
        1 whenever classes outside the top k carry probability mass.
    """
    if model is None:
        model = net

    # Run on whatever device the classifier lives on instead of assuming a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    with torch.no_grad():
        outputs = model(inputs)
        # Normalize over every class first, then pick the k best: applying the
        # softmax to the k selected logits only would renormalize them to sum
        # to 1 and overstate the model's confidence.
        class_probs = torch.softmax(outputs, dim=1)
        probs, predicted = torch.topk(class_probs, k, dim=1)
    return predicted, probs

In [None]:
# Hand-written probe sentences: (text, first target char, last target char).
# Both offsets are 0-based and the last one is inclusive.
dev_in = [
    ("the problem is telling which is the original document and which the copy", 68, 71),
    ("the cause of the accident is not clear", 4, 8),
    ("Rubella, also known as German measles or three-day measles, is an infection caused by the rubella virus.", 0, 6),
    ("he died after a long illness", 21, 27),
    ("for a time revolution was a strong probability", 35, 45),
]
# Expected frame of each probe, in the same order as dev_in.
gold_frames = [
    "Duplication",
    "Causation",
    "Medical_conditions",
    "Medical_conditions",
    "Probability",
]
dev_lab = [frame_dict[frame_name] for frame_name in gold_frames]

dev_dataset = FnBertDataset(dev_in, dev_lab, frame_dict, tokenizer, bert_model)
preds, probs = predict_top_k_dataset(dev_dataset, 5)
preds = preds.tolist()
probs = probs.tolist()

# One line per probe: the five best frames with their scores (2 decimals).
for top_ids, top_scores in zip(preds, probs):
    ranked = []
    for class_id, score in zip(top_ids, top_scores):
        ranked.append((frame_dict_rev[class_id], round(score, 2)))
    print(ranked)