In [2]:
# Directory holding the TSV splits (train.tsv, dev.tsv, test.tsv); it is passed
# as `path` to TabularDataset.splits in this cell.
directory = "~/data/morp/morp-balanced"

from torchtext import data

def _encode_label(raw_label):
    """Turn the raw label string into a class id: 1 for "manhattan", else 0."""
    if raw_label == "manhattan":
        return 1
    return 0


# Both columns are declared non-sequential and without a vocabulary, with
# batch-first layout; only the label column gets a preprocessing step.
_field_options = dict(sequential=False, use_vocab=False, batch_first=True)

TEXT = data.Field(**_field_options)
LABELS = data.Field(preprocessing=_encode_label, **_field_options)

# Column layout of each TSV row: label first, then the instruction text.
tsv_columns = [('label', LABELS), ('instructions', TEXT)]

# Read the three splits from `directory`; the files have no header row.
train_ds, valid_ds, test_ds = data.TabularDataset.splits(
    path=directory,
    train='train.tsv',
    validation='dev.tsv',
    test='test.tsv',
    format='tsv',
    skip_header=False,
    fields=tsv_columns,
)

# Sanity check: show the text of the first training example.
first_train_example = train_ds.examples[0]
print(first_train_example.instructions)


Richard Spiegel developed poetry workshops at the Jefferson Market Library, and facilitated them from 1982 to 1995.


In [3]:
from transformers import DistilBertTokenizerFast
# Load the fast DistilBERT tokenizer for the 'distilbert-base-uncased'
# checkpoint; it is used further down to encode the train/dev/test texts.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
# Pull the instruction text out of every example, one list per split.
train_texts = [example.instructions for example in train_ds.examples]
val_texts = [example.instructions for example in valid_ds.examples]
test_texts = [example.instructions for example in test_ds.examples]


In [5]:
# Pull the (already preprocessed) label out of every example, one list per split.
train_labels = [example.label for example in train_ds.examples]
val_labels = [example.label for example in valid_ds.examples]
test_labels = [example.label for example in test_ds.examples]


In [6]:
# The same tokenizer settings are applied to every split.
tokenizer_options = {
    'truncation': True,
    'padding': True,
    'add_special_tokens': True,
}

train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

In [7]:


import torch

class CabbyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from feature name to an indexable collection of
            per-example values (anything exposing ``.items()``).
        labels: indexable collection of per-example labels; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample





In [8]:
# Wrap each split's encodings and labels in a CabbyDataset.
train_dataset, val_dataset, test_dataset = (
    CabbyDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [9]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optim = AdamW(model.parameters(), lr=5e-5)

NUM_EPOCHS = 3
# Validation starts after this many training steps in an epoch...
EVAL_WARMUP_STEPS = 200
# ...and then runs every this many steps.
EVAL_EVERY_STEPS = 100


def evaluate_accuracy(model, loader, device):
    """Return the classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this from
    inside a training loop does not leave dropout disabled.

    Args:
        model: sequence-classification model called as
            ``model(input_ids, attention_mask=...)``.
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'labels' tensors.
        device: device the batch tensors are moved to.

    Returns:
        Fraction of correctly classified examples, or NaN if ``loader``
        yields no examples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        # No gradients are needed for evaluation; skip building the graph.
        with torch.no_grad():
            for eval_batch in loader:
                eval_labels = eval_batch['labels'].to(device)
                outputs = model(
                    eval_batch['input_ids'].to(device),
                    attention_mask=eval_batch['attention_mask'].to(device),
                )
                # Without labels the first output is the logits; positional
                # indexing works for both tuple and ModelOutput returns.
                eval_logits = outputs[0]
                # argmax over the class axis is safe for any batch size,
                # including a final batch of one example.
                predictions = eval_logits.argmax(dim=-1)
                correct += (predictions == eval_labels).sum().item()
                total += eval_labels.size(0)
    finally:
        # Restore whatever mode the caller had the model in.
        model.train(was_training)
    return correct / total if total else float('nan')


for epoch in range(NUM_EPOCHS):
    model.train()
    for step, batch in enumerate(train_loader, start=1):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied the first output is the loss. Index instead of
        # tuple-unpacking: unpacking a dict-like ModelOutput yields its keys.
        loss = outputs[0]
        loss.backward()
        optim.step()

        if step > EVAL_WARMUP_STEPS and step % EVAL_EVERY_STEPS == 0:
            print(evaluate_accuracy(model, valid_loader, device))



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

KeyboardInterrupt: 