### Processing the data

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"

# Tokenizer that matches the checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pretrained model with a sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset; later cells index it
# by split name ("train", "validation", "test").
raw_datasets = load_dataset("glue", "mrpc")

# Bare expression so the notebook displays the loaded dataset object.
raw_datasets

# Inspect the type information of each column

In [None]:
# Display the feature (column) definitions of the test split.
raw_datasets["test"].features

### Get a glimpse of the dataset

In [None]:
# Display a single example (index 4) from the training split.
raw_datasets["train"][4]

# Tokenize the dataset

In [None]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) of one example or one batch of examples.

    Uses the notebook-level ``tokenizer`` that was loaded from ``checkpoint``
    together with the model, so the tokenizer is not loaded a second time here.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        The tokenizer's output for the sentence pair(s), with truncation
        enabled. No padding is applied here; padding is left to the data
        collator so each batch is only padded to its own longest sample.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

Note that we’ve left the padding argument out in our tokenization function for now. This is because padding all the samples to the maximum length is not efficient: it’s better to pad the samples when we’re building a batch, as then we only need to pad to the maximum length in that batch, and not the maximum length in the entire dataset. This can save a lot of time and processing power when the inputs have very variable lengths! 

In [None]:
# Apply the tokenization function across the dataset; batched=True hands the
# function batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

In [None]:
# Collator that pads samples when a batch is built, using the same tokenizer
# that produced the inputs (see the note on padding above).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### To train the model, first create a `TrainingArguments` instance that will contain all the hyperparameters the `Trainer` will use for training and evaluation.

In [None]:
# Only the output directory is set; every other hyperparameter keeps its default.
training_args = TrainingArguments(output_dir="training-output/test-trainer-gpu")

# Alternative configuration (disabled): save at the end of every epoch and
# push the results to a Hub repository.
# training_args = TrainingArguments(
#     output_dir="training-output/test-trainer-gpu",
#     save_strategy="epoch",
#     push_to_hub=True,
#     hub_model_id="organization/repo-name",
# )

In [None]:
# Wire the model, hyperparameters, tokenized splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # Not required for padding here, because data_collator is passed explicitly.
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning; the returned TrainOutput (step count, training loss,
# runtime metrics) is displayed as the cell output.
trainer.train()

CPU training
```
TrainOutput(global_step=1377, training_loss=0.3178706283403118, metrics={'train_runtime': 863.3071, 'train_samples_per_second': 1.595, 'total_flos': 141940900890768.0, 'epoch': 3.0})
```

GPU training
```
TrainOutput(global_step=1377, training_loss=0.464037926368464, metrics={'train_runtime': 118.9554, 'train_samples_per_second': 11.576, 'total_flos': 141940900890768.0, 'epoch': 3.0})
```

In [None]:
# Push to the Hub

# trainer.push_to_hub("End of training")

# Alternatively, pushing to the Hub is also possible with the model and the tokenizer.