In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
df1 = pd.read_csv("verb_veridicality.csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,Verb,Signature
0,0,ensure,+/o
1,1,prove,o/o
2,2,notice,+/+
3,3,plan,o/o
4,4,attempt,o/-


In [3]:
df2 = pd.read_csv("simple_implicatives.csv")
df2.head()

Unnamed: 0.1,Unnamed: 0,Verb,Signature
0,0,add,np*
1,1,admit,pp
2,2,allow,pp|nn
3,3,allow,pp|nn
4,4,appoint,pp*


In [4]:
sentences = list(df1['Verb'])
sentences[0]

'ensure'

In [5]:
signs = list(df1['Signature'])
print(set(signs))
labels = []
tag2idx = {t: i for i, t in enumerate(set(signs))}
for i in range(len(signs)):
    labels.append(int(tag2idx[signs[i]]))
labels[0]

{'o/+', '+/-', 'o/o', '+/+', '-/o', 'o/-', '-/+', '+/o'}


7

In [6]:
from transformers import RobertaTokenizer

MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [7]:
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

class SignatureData(Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [25]:
train_size = 0.1
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels =  train_test_split(
    sentences, labels,
    test_size=0.1, random_state=200)

print("FULL Dataset: {}".format(len(sentences)))
print("TRAIN Dataset: {}".format(len(train_data)))
print("TEST Dataset: {}".format(len(test_data)))

training_set = SignatureData(sentences, labels, tokenizer, MAX_LEN)
testing_set = SignatureData(sentences, labels, tokenizer, MAX_LEN)

FULL Dataset: 126
TRAIN Dataset: 113
TEST Dataset: 13


In [26]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
import torch

class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.01)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
import os
os.environ['CUDA_VISIBLE_DEVICES']= '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
MAX_LEN = 75
bs = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla K80'

In [12]:
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [13]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [15]:
import logging
logging.basicConfig(level=logging.ERROR)

EPOCHS = 20
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 5000 steps: 2.210535764694214
Training Accuracy per 5000 steps: 0.0


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 0: 13.492063492063492
Training Loss Epoch: 2.1122675389051437
Training Accuracy Epoch: 13.492063492063492
Training Loss per 5000 steps: 2.019711494445801
Training Accuracy per 5000 steps: 37.5


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 1: 65.07936507936508
Training Loss Epoch: 1.6062255203723907
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.3854578733444214
Training Accuracy per 5000 steps: 62.5


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 2: 65.07936507936508
Training Loss Epoch: 1.2839897610247135
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.8938467502593994
Training Accuracy per 5000 steps: 37.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 3: 65.07936507936508
Training Loss Epoch: 1.2618105337023735
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.6337378025054932
Training Accuracy per 5000 steps: 37.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 4: 65.07936507936508
Training Loss Epoch: 1.232622642070055
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 0.44784045219421387
Training Accuracy per 5000 steps: 100.0


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 5: 65.07936507936508
Training Loss Epoch: 1.2297938987612724
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.7446600198745728
Training Accuracy per 5000 steps: 37.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 6: 65.07936507936508
Training Loss Epoch: 1.204610574990511
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.1666392087936401
Training Accuracy per 5000 steps: 62.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 7: 65.07936507936508
Training Loss Epoch: 1.178649928420782
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 1.091200590133667
Training Accuracy per 5000 steps: 62.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 8: 65.07936507936508
Training Loss Epoch: 1.1041384376585484
Training Accuracy Epoch: 65.07936507936508
Training Loss per 5000 steps: 0.6594831943511963
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 9: 68.25396825396825
Training Loss Epoch: 1.0236545521765947
Training Accuracy Epoch: 68.25396825396825
Training Loss per 5000 steps: 0.9893589019775391
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 10: 76.19047619047619
Training Loss Epoch: 0.8819996118545532
Training Accuracy Epoch: 76.19047619047619
Training Loss per 5000 steps: 0.8819599151611328
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 11: 75.39682539682539
Training Loss Epoch: 0.7644772073253989
Training Accuracy Epoch: 75.39682539682539
Training Loss per 5000 steps: 0.3529694676399231
Training Accuracy per 5000 steps: 87.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 12: 78.57142857142857
Training Loss Epoch: 0.6775618819519877
Training Accuracy Epoch: 78.57142857142857
Training Loss per 5000 steps: 0.697538435459137
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 13: 80.15873015873017
Training Loss Epoch: 0.6103034969419241
Training Accuracy Epoch: 80.15873015873017
Training Loss per 5000 steps: 0.5638023018836975
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 14: 83.33333333333333
Training Loss Epoch: 0.527457807213068
Training Accuracy Epoch: 83.33333333333333
Training Loss per 5000 steps: 0.45809826254844666
Training Accuracy per 5000 steps: 100.0


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 15: 86.5079365079365
Training Loss Epoch: 0.4827149142511189
Training Accuracy Epoch: 86.5079365079365
Training Loss per 5000 steps: 0.5541451573371887
Training Accuracy per 5000 steps: 87.5


16it [00:12,  1.25it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 16: 87.3015873015873
Training Loss Epoch: 0.42897267010994256
Training Accuracy Epoch: 87.3015873015873
Training Loss per 5000 steps: 0.22679319977760315
Training Accuracy per 5000 steps: 100.0


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 17: 88.0952380952381
Training Loss Epoch: 0.44432959938421845
Training Accuracy Epoch: 88.0952380952381
Training Loss per 5000 steps: 0.5220338106155396
Training Accuracy per 5000 steps: 75.0


16it [00:12,  1.26it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 18: 93.65079365079364
Training Loss Epoch: 0.37280168291181326
Training Accuracy Epoch: 93.65079365079364
Training Loss per 5000 steps: 0.34245386719703674
Training Accuracy per 5000 steps: 87.5


16it [00:12,  1.26it/s]

The Total Accuracy for Epoch 19: 94.44444444444444
Training Loss Epoch: 0.3339135004207492
Training Accuracy Epoch: 94.44444444444444





In [38]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            print(big_idx.cpu().detach().numpy())
            print(targets.cpu().detach().numpy())
            n_correct += calcuate_accuracy(big_idx, targets)

            #nb_tr_steps += 1
            #nb_tr_examples+=targets.size(0)
            
            #if _%5000==0:
            #    loss_step = tr_loss/nb_tr_steps
            #    accu_step = (n_correct*100)/nb_tr_examples
            #    print(f"Validation Loss per 100 steps: {loss_step}")
            #    print(f"Validation Accuracy per 100 steps: {accu_step}")
    #epoch_loss = tr_loss/nb_tr_steps
    #epoch_accu = (n_correct*100)/nb_tr_examples
    #print(f"Validation Loss Epoch: {epoch_loss}")
    #print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    #return epoch_accu


In [39]:
valid(model, testing_loader)

2it [00:00, 11.13it/s]

[0 2 2 2]
[0 2 2 2]
[2 2 2 3]
[2 2 2 3]
[7 0 1 2]
[7 0 1 2]


6it [00:00, 11.84it/s]

[7 0 3 2]
[7 0 3 2]
[2 1 3 2]
[2 1 3 2]
[2 2 2 3]
[2 0 2 3]


8it [00:00, 12.41it/s]

[2 3 2 2]
[2 3 2 2]
[2 2 2 3]
[2 2 2 3]
[7 2 1 2]
[6 2 1 2]


12it [00:00, 13.57it/s]

[2 0 7 2]
[2 5 7 2]
[3 2 2 1]
[3 2 2 1]
[2 7 2 2]
[2 7 2 2]
[2 2 2 3]
[2 2 2 3]


16it [00:01, 14.40it/s]

[2 3 2 2]
[2 3 2 2]
[3 2 1 2]
[3 2 6 2]
[2 0 7 2]
[2 0 7 2]
[2 3 2 2]
[2 3 2 2]


20it [00:01, 14.81it/s]

[2 2 2 4]
[2 2 2 4]
[1 2 2 2]
[1 2 2 2]
[2 7 2 2]
[2 7 2 2]
[1 2 2 2]
[1 2 2 2]


24it [00:01, 15.10it/s]

[2 2 2 3]
[2 2 2 1]
[4 2 1 2]
[4 2 1 2]
[2 4 2 7]
[2 4 2 7]
[2 2 2 7]
[2 2 2 7]


28it [00:01, 15.24it/s]

[2 2 3 2]
[2 2 3 2]
[2 2 2 2]
[2 2 2 2]
[2 0 3 2]
[2 0 3 2]
[3 2 2 2]
[3 2 2 2]


32it [00:02, 14.54it/s]

[2 3 2 2]
[2 3 2 2]
[1 2 2 2]
[1 2 2 2]
[2 2]
[2 2]





In [20]:
output_model_file = 'roberta_implication.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

('./vocab.json', './merges.txt')