In [1]:
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer , DataCollatorWithPadding
from datasets import load_dataset
import pandas as pd
import numpy as np

In [19]:
def create_data(data_path,labels_path,out_name):
    data = pd.read_csv(data_path)
    data_labels = pd.read_csv(labels_path,header=None)
    labels = np.abs(1 - data_labels.iloc[:,1])
    data['labels'] = labels
    data.drop('id',axis=1).to_csv(out_name + '.csv',index=False)
    return data

In [20]:
train_path = 'AllData/TrainingData/subtaskA_data_all.csv'
train_labels_path = 'AllData/TrainingData/subtaskA_answers_all.csv'
dev_path = 'AllData/DevData/subtaskA_dev_data.csv'
dev_labels_path = 'AllData/DevData/subtaskA_gold_answers.csv'
test_path = 'AllData/TestData/subtaskA_test_data.csv'
test_labels_path = 'AllData/TestData/subtaskA_gold_answers.csv' 
data_train = create_data(train_path,train_labels_path,'Train')
data_dev = create_data(dev_path,dev_labels_path,'Dev')
data_test = create_data(test_path,test_labels_path,'Test')

In [2]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [3]:
data = load_dataset('csv',data_files={'train':'Train.csv',
                                      'validation': 'Dev.csv',
                                      'test': 'Test.csv'})
data

Using custom data configuration default-559d4f6e180b2b63
Reusing dataset csv (C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


DatasetDict({
    train: Dataset({
        features: ['sent0', 'sent1', 'Label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['Label', 'sent0', 'sent1'],
        num_rows: 997
    })
    test: Dataset({
        features: ['Label', 'sent0', 'sent1'],
        num_rows: 1000
    })
})

In [4]:
data['train'][0]

{'sent0': 'He poured orange juice on his cereal.',
 'sent1': 'He poured milk on his cereal.',
 'Label': 1}

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,use_fast=True)

In [6]:
def preprocess(example):
    return tokenizer(example['sent0'],example['sent1'],truncation=True,padding=True)

In [7]:
preprocess(data['train'][:3])

{'input_ids': [[101, 2002, 8542, 4589, 10869, 2006, 2010, 20943, 1012, 102, 2002, 8542, 6501, 2006, 2010, 20943, 1012, 102], [101, 2002, 8974, 6207, 1012, 102, 2002, 8974, 6501, 1012, 102, 0, 0, 0, 0, 0, 0, 0], [101, 5076, 2743, 1037, 3542, 2651, 102, 5076, 2743, 2531, 1010, 2199, 2661, 2651, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [8]:
encoded_data = data.map(preprocess,batched=True)

Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-20476fc45944cb65.arrow
Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-5e0f673d635bd660.arrow
Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-e029f18cceb1b3fe.arrow


In [9]:
def add_column(example):
    x = {}
    x['labels'] = example['Label']
    return x

In [10]:
encoded_data = encoded_data.map(add_column,batched=True)

Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-56989bef87f5e6de.arrow
Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-1de38d1fda347de8.arrow
Loading cached processed dataset at C:\Users\debal\.cache\huggingface\datasets\csv\default-559d4f6e180b2b63\0.0.0\2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0\cache-999e0fab62437d73.arrow


In [11]:
encoded_data

DatasetDict({
    train: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 997
    })
    test: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 1000
    })
})

In [12]:
encoded_data = encoded_data.remove_columns(
    ["Label", "sent0",'sent1']
)

In [13]:
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [15]:
from transformers import get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len()
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print(num_training_steps)

TypeError: object of type 'int' has no len()

In [16]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    "test-glue",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions,axis=1)
    #print(type(predictions),type(labels))
    return metric.compute(predictions=predictions, references=labels)

In [18]:
validation_key = "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [39]:
encoded_data

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 997
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels'],
        num_rows: 1000
    })
})

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.078,1.432153,0.463017


KeyError: 'eval_accuracy'

In [82]:
encoded_data

DatasetDict({
    train: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 997
    })
    test: Dataset({
        features: ['Label', 'attention_mask', 'input_ids', 'labels', 'sent0', 'sent1'],
        num_rows: 1000
    })
})

In [32]:
from datasets import load_dataset, load_metric

In [33]:
task = 'cola'
actual_task = "mnli" if task == "mnli-mm" else task
metric = load_metric('glue', actual_task)

In [25]:
import numpy as np

In [100]:
model.device

device(type='cuda', index=0)

In [104]:
from torch.utils.data import DataLoader

In [110]:
bs = 16
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader_1 = DataLoader(
    encoded_data["train"], shuffle=True, batch_size=bs, collate_fn=data_collator
)


In [111]:
for batch in train_dataloader_1:
    break
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([16, 53]),
 'input_ids': torch.Size([16, 53]),
 'labels': torch.Size([16])}

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [118]:
model(**batch)

SequenceClassifierOutput(loss=tensor(0.7008, grad_fn=<NllLossBackward>), logits=tensor([[ 0.0344,  0.0844],
        [-0.0305,  0.0512],
        [-0.0082,  0.1706],
        [-0.0429,  0.1338],
        [-0.0226,  0.1173],
        [-0.0182,  0.0580],
        [ 0.0972,  0.1071],
        [ 0.0239,  0.1170],
        [ 0.0492,  0.0142],
        [ 0.0039,  0.0759],
        [-0.0219,  0.1537],
        [ 0.0161,  0.0402],
        [-0.0403, -0.0057],
        [ 0.0205,  0.1038],
        [-0.0116,  0.0541],
        [ 0.0303,  0.1003]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)