In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer,DataCollatorWithPadding

# Load the GLUE MRPC splits and the tokenizer belonging to the BERT checkpoint
# that is used for the model later on.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) found in ``example``.

    ``example`` is indexed by the keys ``"sentence1"`` and ``"sentence2"``; both
    values are handed to the module-level ``tokenizer`` as a text pair.
    Truncation is enabled and no padding is requested here.

    Returns whatever the tokenizer call returns.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# batched=True passes several examples per call to tokenize_function.
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
# Collator built from the same tokenizer; it is used as the DataLoader
# collate_fn so padding happens per batch rather than at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Preparing for training

### Post-processing
- Remove the columns corresponding to values the model does not expect (like the sentence1 and sentence2 columns).  
- Rename the column label to labels (because the model expects the argument to be named labels).  
- Set the format of the datasets so they return PyTorch tensors instead of lists.  


In [9]:
# Drop the columns the model does not accept and rename the target column
# to "labels", the argument name the model expects.
tokenized_dataset = tokenized_dataset.remove_columns(
    ["sentence1", "sentence2", "idx"]
).rename_column("label", "labels")
# Make the datasets return PyTorch tensors instead of lists.
tokenized_dataset.set_format("torch")
# Last expression: show the remaining columns of the training split.
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
# Inspect the processed dataset; its repr is displayed as the cell output.
tokenized_dataset

In [10]:
from torch.utils.data import DataLoader

# Training batches are drawn in a shuffled order; data_collator assembles
# (and pads) each batch of 8 examples.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
)
# Evaluation gains nothing from shuffling: keep a fixed order so that results
# are reproducible and predictions can be lined up with the original examples.
eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

In [12]:
# Pull a single batch out of the training dataloader for inspection: the loop
# exits right after its first iteration, leaving `batch` bound to that batch.
for batch in train_dataloader:
    break
# Map every field of the batch to the shape of its tensor.
{k: v.shape for k,v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 66]),
 'token_type_ids': torch.Size([8, 66]),
 'attention_mask': torch.Size([8, 66])}

**Note:** It might appear that `batch` is out of scope after the `for` loop, but a Python loop does not create a new scope. The loop variable keeps the last value it was assigned (here, the first batch) after the loop exits, so `batch` can still be used in the cells that follow.

In [13]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint into a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# All 🤗 Transformers models return the loss when labels are provided.
# We also get the logits: two for each input in our batch, so a tensor of size 8 x 2.
output = model(**batch)
print(output.logits.shape, output.loss)

torch.Size([8, 2]) tensor(0.8705, grad_fn=<NllLossBackward0>)


### Optimizer and learning rate scheduler

In [18]:
# transformers.AdamW is deprecated (and no longer shipped by recent transformers
# releases); PyTorch's own implementation is the supported replacement.
from torch.optim import AdamW

# weight_decay=0.0 keeps the no-decay default of the old transformers.AdamW;
# torch.optim.AdamW would otherwise apply its own default of 0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)



In [20]:
from transformers import get_scheduler

num_epochs = 3
# One scheduler step per batch: number of batches per epoch times epochs.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warm-up steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1377


In [21]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Last expression: display which device was selected.
device

device(type='cpu')

In [22]:
from tqdm.auto import tqdm
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
# Run all num_epochs epochs: num_training_steps (which sizes both the LR
# schedule and the progress bar) was computed from num_epochs, so a shorter
# loop would stop with the schedule and the bar only partly consumed.
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # The model was moved to `device`; its inputs have to live there too,
        # otherwise the forward pass fails as soon as a GPU is used.
        batch = {k: v.to(device) for k, v in batch.items()}
        output = model(**batch)
        loss = output.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Must be *called*: a bare `optimizer.zero_grad` only looks the method
        # up, so gradients would keep accumulating from one step to the next.
        optimizer.zero_grad()
        progress_bar.update(1)
        



  0%|          | 0/1377 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
from accelerate import Accelerator

In [34]:
# transformers.AdamW is deprecated (and no longer shipped by recent transformers
# releases); use PyTorch's implementation instead.
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

# Start again from a freshly loaded model with its own optimizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# weight_decay=0.0 keeps the no-decay default of the old transformers.AdamW;
# torch.optim.AdamW would otherwise apply its own default of 0.01.
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.0)

# Hand the dataloaders, model and optimizer to Accelerate. The loop below uses
# the returned objects and does no manual device placement of its own.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
# Computed from the prepared dataloader, whose length is what the loop iterates.
num_training_steps = num_epochs * len(train_dl)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()

# Run all num_epochs epochs so the loop matches num_training_steps, which sizes
# both the LR schedule and the progress bar.
for epoch in range(num_epochs):
    for batch in train_dl:
        output = model(**batch)
        loss = output.loss
        # Replaces loss.backward() when training through Accelerate.
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        # Must be *called*: a bare `optimizer.zero_grad` only looks the method
        # up, so gradients would keep accumulating from one step to the next.
        optimizer.zero_grad()
        progress_bar.update(1)


        






Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1377 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# NOTE(review): stray scratch cell. `a` is not defined anywhere in the visible
# notebook, so running this cell would raise NameError; consider deleting it.
a