In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 4.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.3MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 31.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K 

In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
def get_data():
    ! wget https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
    ! mkdir data
    ! unzip -q cola_public_1.1.zip -d ./data
get_data()

--2020-09-12 19:31:16--  https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
Resolving nyu-mll.github.io (nyu-mll.github.io)... 185.199.108.153, 185.199.111.153, 185.199.109.153, ...
Connecting to nyu-mll.github.io (nyu-mll.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255330 (249K) [application/zip]
Saving to: ‘cola_public_1.1.zip’


2020-09-12 19:31:17 (7.22 MB/s) - ‘cola_public_1.1.zip’ saved [255330/255330]



In [6]:
df = pd.read_csv('./data/cola_public/raw/in_domain_train.tsv', delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])
df.head()

Unnamed: 0,sentence_source,label,label_notes,sentence
0,gj04,1,,"Our friends won't buy this analysis, let alone..."
1,gj04,1,,One more pseudo generalization and I'm giving up.
2,gj04,1,,One more pseudo generalization or I'm giving up.
3,gj04,1,,"The more we study verbs, the crazier they get."
4,gj04,1,,Day by day the facts are getting murkier.


In [7]:
class GLUE(Dataset):
    def __init__(self, data, tokenizer, max_len):
        super().__init__()
        self.X = data.sentence.values
        self.y = data.label.values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        inputs = self.tokenizer.tokenize(self.X[idx])
        inputs = self.tokenizer.encode_plus(inputs,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            pad_to_max_length=True,
                                            truncation=True)
        
        return torch.LongTensor(inputs['input_ids']), torch.LongTensor(inputs['attention_mask']), self.y[idx]


In [57]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# testing the dataset
test = GLUE(df.head(), tokenizer, MAX_LEN)
test[0]



(tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
          2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [10]:
# splitting up the data and throwing it in a dataloader
from sklearn.model_selection import train_test_split

X_train, X_test, _, _ = train_test_split(df, df.label, test_size=.2, random_state=42)
train_ds = GLUE(X_train, tokenizer, MAX_LEN)
valid_ds = GLUE(X_test, tokenizer, MAX_LEN)

train_dl = DataLoader(train_ds, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
valid_dl = DataLoader(valid_ds, batch_size=VALID_BATCH_SIZE)

In [11]:
x, mask, y = next(iter(train_dl))
x



tensor([[  101,  1996,  9540,  ...,     0,     0,     0],
        [  101,  5742,  2003,  ...,     0,     0,     0],
        [  101,  2952,  1051,  ...,     0,     0,     0],
        ...,
        [  101,  2984, 16849,  ...,     0,     0,     0],
        [  101, 20328, 17645,  ...,     0,     0,     0],
        [  101,  1045,  2404,  ...,     0,     0,     0]])

In [12]:
x.size(), mask.size(), y.size()

(torch.Size([16, 128]), torch.Size([16, 128]), torch.Size([16]))

In [13]:
class BERT(nn.Module):
    def __init__(self, dropout):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        seq, pool = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = F.relu(self.pre_classifier(pool))
        output = self.dropout(output)
        return self.classifier(output)

In [14]:
model = BERT(.3)
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       

In [72]:
def train_model(model, train_dl, valid_dl, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss, total, n_correct = 0, 0, 0
        
        for idx, data in enumerate(train_dl):
            ids = data[0].to(device)
            mask = data[1].to(device)
            targets = torch.LongTensor(data[2]).to(device)
            batch_size = targets.shape[0]

            y_pred = model(ids, mask)
            loss = F.binary_cross_entropy_with_logits(y_pred, targets.float().unsqueeze(1))
            total_loss += loss.item() * batch_size
            total += batch_size
            out = (y_pred >= 0.5).float().squeeze()
            n_correct += out.eq(targets).sum().item()

            if idx % 100 == 0:
                print(f"Training Loss per 100 steps: {total_loss / total}")
                print(f"Training Accuracy per 100 steps: {n_correct/total}")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        val_loss, val_acc = valid_model(model, valid_dl)
        print(f"Epoch {epoch+1} Training Loss: {total_loss/total} Training Accuracy: {n_correct/total} Valid Loss: {val_loss} Valid Accuracy: {val_acc}")    

In [73]:
def valid_model(model, valid_dl):
    model.eval()
    total_loss, total, n_correct = 0, 0, 0
    for data in valid_dl:
        ids = data[0].to(device)
        mask = data[1].to(device)
        targets = torch.LongTensor(data[2]).to(device)
        batch_size = targets.shape[0]

        y_pred = model(ids, mask)
        loss = F.binary_cross_entropy(torch.sigmoid(y_pred), targets.float().unsqueeze(1))
        total_loss += loss.item() * batch_size
        total += batch_size
        out = (y_pred >= 0.5).float().squeeze()
        n_correct += out.eq(targets).sum().item()

    return total_loss/total, n_correct/total

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
train_model(model, train_dl, valid_dl, optimizer, EPOCHS)



Training Loss per 100 steps: 0.0057329232804477215
Training Accuracy per 100 steps: 1.0
Training Loss per 100 steps: 0.058381374739802046
Training Accuracy per 100 steps: 0.9832920792079208
Training Loss per 100 steps: 0.07066349477671661
Training Accuracy per 100 steps: 0.9791666666666666
Training Loss per 100 steps: 0.07334175968637065
Training Accuracy per 100 steps: 0.9775747508305648
Training Loss per 100 steps: 0.07907062089401634
Training Accuracy per 100 steps: 0.9738154613466334
Epoch 1 Training Loss: 0.07811889574394631 Training Accuracy: 0.974561403508772 Valid Loss: 0.7202749164575795 Valid Accuracy: 0.8158971361776739
Training Loss per 100 steps: 0.017747756093740463
Training Accuracy per 100 steps: 1.0
Training Loss per 100 steps: 0.056130996077681086
Training Accuracy per 100 steps: 0.9746287128712872
Training Loss per 100 steps: 0.06342475089552668
Training Accuracy per 100 steps: 0.9760572139303483
Training Loss per 100 steps: 0.064611841165275
Training Accuracy per 10