In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -d ./ /content/drive/MyDrive/Partitive-Files.zip

Archive:  /content/drive/MyDrive/Partitive-Files.zip
   creating: ./Partitive-Files/
  inflating: ./__MACOSX/._Partitive-Files  
  inflating: ./Partitive-Files/part-training  
  inflating: ./__MACOSX/Partitive-Files/._part-training  
  inflating: ./Partitive-Files/part-test  
  inflating: ./__MACOSX/Partitive-Files/._part-test  
  inflating: ./Partitive-Files/README.txt  
  inflating: ./__MACOSX/Partitive-Files/._README.txt  
  inflating: ./Partitive-Files/part-dev  
  inflating: ./__MACOSX/Partitive-Files/._part-dev  
  inflating: ./Partitive-Files/%-dev  
  inflating: ./__MACOSX/Partitive-Files/._%-dev  
  inflating: ./Partitive-Files/%-training  
  inflating: ./__MACOSX/Partitive-Files/._%-training  
  inflating: ./Partitive-Files/%-test  
  inflating: ./__MACOSX/Partitive-Files/._%-test  


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [4]:
import pandas as pd
import nltk
import re

In [5]:
def construct_dataset(filename):
    corpus = []
    sentence = []
    labels = []
    i = 0

    with open(filename, 'r') as f:
        for line in f.readlines():
            line = line.strip('\n')

            if line:
                word = line.split()[0]
                sentence.append(word)
                if line.split()[-1] == 'ARG1':
                    labels.append(i) 
                i += 1
            else:
                sentence = ' '.join(sentence)
                sentence = sentence.replace('COMMA', ',')
                sentence = re.sub(r'[^\w\s]', '', sentence).strip()
                sentence = re.sub(r' +', ' ', sentence)

                corpus.append(sentence)
                sentence = []
                i = 0
        return corpus, labels

In [6]:
X_train, y_train = construct_dataset('Partitive-Files/%-training')
X_dev, y_dev = construct_dataset('Partitive-Files/%-dev')
X_test, y_test = construct_dataset('Partitive-Files/%-test')

In [7]:
#load pretrained bert base model
from transformers import BertModel
bert_model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

nltk.download('punkt')

class MyDataset(Dataset):

    def __init__(self, corpus, labels, maxlen):

        self.corpus = corpus
        self.labels = labels
        
        # initialize tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.maxlen = maxlen

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, index):

        sentence = self.corpus[index]
        label = self.labels[index]

        tokens = self.tokenizer.tokenize(sentence)
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] 
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
from torch.utils.data import DataLoader

train_set = MyDataset(X_train, y_train, maxlen = 365)
dev_set = MyDataset(X_dev, y_dev, maxlen = 365)

train_loader = DataLoader(train_set, batch_size = 16, num_workers = 2)
dev_loader = DataLoader(dev_set, batch_size = 16, num_workers = 2)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [10]:
import torch
import torch.nn as nn
from transformers import BertModel

class Arg1Classifier(nn.Module):

    def __init__(self):
        super(Arg1Classifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        self.cls_layer = nn.Linear(768, 365)

    def forward(self, seq, attn_masks):

        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = outputs.last_hidden_state

        cls_rep = cont_reps[:, 0]

        logits = self.cls_layer(cls_rep)

        return logits

In [11]:
gpu = 0

print("Creating classifier, initialised with pretrained BERT-BASE parameters...")
net = Arg1Classifier()
net.cuda(gpu)
print("Done creating classifier.")

Creating classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating classifier.


In [12]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [13]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_acc = 0
    st = time.time()
    best_net = None
    for ep in range(max_eps):
        
        for it, (seq, attn_masks, labels) in enumerate(train_loader):

            opti.zero_grad()  

            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            logits = net(seq, attn_masks)

            loss = criterion(logits.squeeze(-1), labels)

            loss.backward()

            opti.step()
              
            if it % 10 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep+1, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep+1, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            best_net = net
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep+1))
    return best_net

In [14]:
def get_accuracy_from_logits(logits, labels):
    preds = logits.argmax(dim=1)
    acc = (preds == labels).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [15]:
num_epoch = 5

best_net = train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 1 complete. Loss: 5.9855475425720215; Accuracy: 0.0; Time taken (s): 2.3719849586486816
Iteration 10 of epoch 1 complete. Loss: 5.733220100402832; Accuracy: 0.0; Time taken (s): 19.865821599960327
Iteration 20 of epoch 1 complete. Loss: 5.072379112243652; Accuracy: 0.125; Time taken (s): 20.2315673828125
Iteration 30 of epoch 1 complete. Loss: 4.806010723114014; Accuracy: 0.0625; Time taken (s): 20.26855182647705
Iteration 40 of epoch 1 complete. Loss: 4.328001022338867; Accuracy: 0.25; Time taken (s): 20.188449382781982
Iteration 50 of epoch 1 complete. Loss: 4.079409122467041; Accuracy: 0.3125; Time taken (s): 20.16918659210205
Iteration 60 of epoch 1 complete. Loss: 3.8072619438171387; Accuracy: 0.25; Time taken (s): 20.10268545150757
Iteration 70 of epoch 1 complete. Loss: 3.627598762512207; Accuracy: 0.0625; Time taken (s): 20.156567573547363
Iteration 80 of epoch 1 complete. Loss: 4.1559529304504395; Accuracy: 0.0625; Time taken (s): 20.096282720565796
Iterat

In [17]:
test_set = MyDataset(X_test, y_test, maxlen = 365)
test_loader = DataLoader(test_set, batch_size = 16, num_workers = 2)

In [18]:
test_acc, test_loss = evaluate(best_net, criterion, test_loader, gpu)

In [19]:
print("Test Accuracy: {}; Test Loss: {}".format(test_acc, test_loss))

Test Accuracy: 0.30416667461395264; Test Loss: 2.879079556465149
