In [1]:
import wget
import os

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

BASE_DIR = '' # Working directory
DATA_DIR = f'{BASE_DIR}data/' # Data directory
MODELS_DIR = f'{BASE_DIR}models/' # Models directory

# Download the file (if we haven't already)
if not os.path.exists(f'{BASE_DIR}/data/cola_public_1.1.zip'):
    wget.download(url, f'{BASE_DIR}/data/cola_public_1.1.zip')
    
if not os.path.exists(f'{BASE_DIR}/data/cola_public/'):
    !unzip /user/HS229/gz00109/data/cola_public_1.1.zip -d /user/HS229/gz00109/data/

# Load data

In [3]:
import pandas as pd
# Load the dataset into a pandas dataframe.
df = pd.read_csv(f"{DATA_DIR}/cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
7838,ad03,1,,He kicked him
8033,ad03,1,,Euclid was interested in Plato's description o...
4879,ks08,1,,has no relative pronoun at all.
1253,r-67,1,,You're going to hurt yourself one of these days.
2122,rhl07,1,,I sent the salesman to the devil.
6608,g_81,1,,"Smith loaned, and his widow later donated, a v..."
4157,ks08,0,*,Fifteen dollars in a week are not much.
4508,ks08,1,,"Mary sang a song, but Lee did not."
8238,ad03,1,,Look after yourself!
2942,l-93,1,,Donna fixed a sandwich for me.


In [4]:
sentences = list(df.sentence.values)
labels = df.label.values

# Define model

In [23]:
from transformers import RobertaForSequenceClassification, AdamW
roberta_base = RobertaForSequenceClassification.from_pretrained('roberta-base').cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

# Prepare data

In [24]:
import torch
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, random_split

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
device = torch.device('cuda')

In [25]:
encoded_dict = tokenizer(sentences, truncation=True, padding=True)

In [26]:
class ColaDataset(Dataset):
    def __init__(self, encodings, labels, limit=None):
        self.encodings = {key: encodings[key][:limit] for key in encodings}
        self.labels = labels[:limit]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], device=device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
cola_dataset = ColaDataset(encoded_dict, labels)

## Train and validation set

In [27]:
train_size = int(0.8 * len(cola_dataset))
val_size = int(0.1 * len(cola_dataset))
test_size = len(cola_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(cola_dataset, [train_size, val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

6,840 training samples
  855 validation samples
  856 test samples


In [28]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [29]:
batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size=batch_size 
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size=batch_size 
)

test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size=batch_size 
)

# Prepare training

In [32]:
import pytorch_lightning as pl
import torch.nn.functional as F
from pytorch_lightning.metrics import functional as FM
from pytorch_lightning import loggers as pl_loggers

class LMClassifier(pl.LightningModule):
    
    def __init__(self, model, tokenizer, labels):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.labels = labels
        self.valid_acc = pl.metrics.Accuracy()
        self.train_acc = pl.metrics.Accuracy()
    
    def forward(self, x):
        self.model.eval()
        self.model.cuda()
        
        input_ids = tokenizer.encode(x, return_tensors='pt').to(device)
        outputs = self.model(input_ids)
        prob = F.softmax(outputs.logits.detach(), dim=1).cpu().numpy()[0].tolist()
        
        return {label: prob[index]  for index, label in enumerate(self.labels)}
            
    def training_step(self, batch, batch_idx):
        labels = batch["labels"]
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = None

        outputs = self.model(**batch)
        
        labels_hat = torch.argmax(outputs.logits, dim=1)        
        
        self.log('train_loss', outputs.loss)        
        self.train_acc(labels_hat, batch['labels'])
        self.log('train_acc', self.train_acc, on_epoch=True, prog_bar=True)  
        
        return outputs.loss
    
    
    def validation_step(self, batch, batch_idx):  
        outputs = self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )
            
        labels_hat = torch.argmax(outputs.logits, dim=1)
        
        val_acc = self.valid_acc(labels_hat, batch['labels'])
        self.log('valid_acc', val_acc, on_step=True, on_epoch=True, prog_bar=True)    
 
    
    def test_step(self, batch, batch_idx):   
        outputs = self.model(input_ids=batch["input_ids"])
        
        labels_hat = torch.argmax(outputs.logits, dim=1)
        acc = FM.accuracy(labels_hat, batch['labels'])

        metrics = {'test_acc': acc}
        self.log_dict(metrics)
        return metrics
    
    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr = 2e-5, eps = 1e-8)
        return optimizer
    
cola_model = LMClassifier(roberta_base, tokenizer, ['wrong', 'correct'])

tb_logger = pl_loggers.TensorBoardLogger('cola-logs/')
trainer = pl.Trainer(gpus=1, max_epochs=5, checkpoint_callback=False, logger=tb_logger)
trainer.fit(cola_model, train_dataloader, validation_dataloader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | model     | RobertaForSequenceClassification | 124 M 
1 | valid_acc | Accuracy                         | 0     
2 | train_acc | Accuracy                         | 0     
---------------------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.589   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [33]:
cola_model = LMClassifier(roberta_base, tokenizer, ['wrong', 'correct'])
test_trainer = pl.Trainer(gpus=1)
test_trainer.test(cola_model, test_dataloaders=test_dataloader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8333333134651184}
--------------------------------------------------------------------------------


[{'test_acc': 0.8333333134651184}]

In [34]:
roberta_base.save_pretrained(MODELS_DIR+'roberta-cola-v1')