### Processing the data

In [1]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

In [2]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "bert-base-uncased"

# Tokenizer loaded from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model loaded from the same checkpoint and
# configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset. As the output below
# shows, the result is a DatasetDict with train / validation / test splits.
raw_datasets = load_dataset(path="glue", name="mrpc")

# Display the splits with their columns and row counts.
raw_datasets

Reusing dataset glue (C:\Users\codenamewei\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


DatasetDict({
    train: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 408
    })
    test: Dataset({
        features: ['idx', 'label', 'sentence1', 'sentence2'],
        num_rows: 1725
    })
})

# Reveal information of each column

In [4]:
# Show the name and type of every column in the test split.
test_split = raw_datasets["test"]
test_split.features

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}

### Get a glimpse of the dataset

In [5]:
# Look at one training example (index 4) to see what a record contains.
train_split = raw_datasets["train"]
train_split[4]

{'idx': 4,
 'label': 1,
 'sentence1': 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .',
 'sentence2': 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .'}

# Tokenize the dataset

In [6]:
def tokenize_function(example):
    """Tokenize the sentence pair(s) in ``example`` with the notebook's tokenizer.

    Uses the module-level ``tokenizer`` created earlier in the notebook from
    ``checkpoint``; it is not re-loaded here.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.
            This function is passed to ``raw_datasets.map(..., batched=True)``,
            so each entry is presumably a list of strings rather than a single
            string -- confirm against the ``datasets`` documentation.

    Returns:
        Whatever ``tokenizer`` returns for the two inputs with
        ``truncation=True``.
    """
    # Padding is deliberately not requested here: samples are padded later,
    # per batch, by the data collator, so each batch is only padded to its own
    # longest sequence instead of the longest sequence in the whole dataset.
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

Note that we’ve left the padding argument out in our tokenization function for now. This is because padding all the samples to the maximum length is not efficient: it’s better to pad the samples when we’re building a batch, as then we only need to pad to the maximum length in that batch, and not the maximum length in the entire dataset. This can save a lot of time and processing power when the inputs have very variable lengths! 

In [7]:
# Run tokenize_function over every split of the raw dataset in batched mode.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

100%|██████████| 4/4 [00:00<00:00, 19.49ba/s]
100%|██████████| 1/1 [00:00<00:00, 53.52ba/s]
100%|██████████| 2/2 [00:00<00:00, 28.86ba/s]


In [8]:
# Collator that pads the samples when a batch is built (see the note on
# padding above), using the same tokenizer as the tokenization step.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

### To train the model, first define a `TrainingArguments` object that contains all the hyperparameters the `Trainer` will use for training and evaluation.

In [13]:
# Only the output directory is specified; every other training argument is
# left at whatever TrainingArguments uses by default.
# NOTE(review): the recorded log below reports loss and eval metrics every
# 10 steps, which suggests extra logging/evaluation options were set for that
# run -- confirm the arguments that actually produced it.
OUTPUT_DIR = "training-output/test-trainer-gpu"
training_args = TrainingArguments(output_dir=OUTPUT_DIR)

In [14]:
# Bundle the model, the training arguments, the tokenized train/validation
# splits and the batching logic into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # The tokenizer could be skipped here since data_collator is passed in.
    tokenizer=tokenizer,
)

In [15]:
# Run training; the returned TrainOutput is displayed as the cell result.
train_output = trainer.train()
train_output

  1%|          | 10/1377 [00:00<01:46, 12.83it/s]

{'loss': 0.1961, 'learning_rate': 4.963689179375454e-05, 'epoch': 0.02}



  1%|          | 12/1377 [00:01<05:09,  4.41it/s]

{'eval_loss': 0.576067328453064, 'eval_runtime': 0.8483, 'eval_samples_per_second': 480.961, 'epoch': 0.02}


  1%|▏         | 20/1377 [00:02<02:31,  8.94it/s]

{'loss': 0.1827, 'learning_rate': 4.927378358750908e-05, 'epoch': 0.04}



  2%|▏         | 22/1377 [00:03<05:14,  4.31it/s]

{'eval_loss': 1.0718399286270142, 'eval_runtime': 0.8373, 'eval_samples_per_second': 487.301, 'epoch': 0.04}


  2%|▏         | 30/1377 [00:04<02:36,  8.59it/s]

{'loss': 0.2733, 'learning_rate': 4.891067538126362e-05, 'epoch': 0.07}



  2%|▏         | 32/1377 [00:05<05:20,  4.19it/s]

{'eval_loss': 0.6204397678375244, 'eval_runtime': 0.8429, 'eval_samples_per_second': 484.06, 'epoch': 0.07}


  3%|▎         | 40/1377 [00:05<02:35,  8.58it/s]

{'loss': 0.1939, 'learning_rate': 4.854756717501816e-05, 'epoch': 0.09}



  3%|▎         | 42/1377 [00:06<05:11,  4.29it/s]

{'eval_loss': 0.5554633736610413, 'eval_runtime': 0.8418, 'eval_samples_per_second': 484.673, 'epoch': 0.09}


  4%|▎         | 50/1377 [00:07<02:34,  8.61it/s]

{'loss': 0.0959, 'learning_rate': 4.8184458968772694e-05, 'epoch': 0.11}



  4%|▍         | 52/1377 [00:08<05:15,  4.20it/s]

{'eval_loss': 0.7873954176902771, 'eval_runtime': 0.8611, 'eval_samples_per_second': 473.798, 'epoch': 0.11}


  4%|▍         | 60/1377 [00:09<02:36,  8.42it/s]

{'loss': 0.2729, 'learning_rate': 4.7821350762527234e-05, 'epoch': 0.13}



  5%|▍         | 62/1377 [00:10<05:15,  4.16it/s]

{'eval_loss': 0.8744348287582397, 'eval_runtime': 0.8564, 'eval_samples_per_second': 476.413, 'epoch': 0.13}


  5%|▌         | 70/1377 [00:10<02:37,  8.31it/s]

{'loss': 0.0024, 'learning_rate': 4.7458242556281774e-05, 'epoch': 0.15}



  5%|▌         | 72/1377 [00:11<05:12,  4.18it/s]

{'eval_loss': 0.9205788969993591, 'eval_runtime': 0.8535, 'eval_samples_per_second': 478.013, 'epoch': 0.15}


  6%|▌         | 80/1377 [00:12<02:36,  8.31it/s]

{'loss': 0.2387, 'learning_rate': 4.709513435003631e-05, 'epoch': 0.17}



  6%|▌         | 82/1377 [00:13<05:12,  4.15it/s]

{'eval_loss': 0.8418758511543274, 'eval_runtime': 0.8675, 'eval_samples_per_second': 470.316, 'epoch': 0.17}


  7%|▋         | 90/1377 [00:14<02:36,  8.23it/s]

{'loss': 0.1872, 'learning_rate': 4.673202614379085e-05, 'epoch': 0.2}



  7%|▋         | 92/1377 [00:15<05:08,  4.17it/s]

{'eval_loss': 0.8375437259674072, 'eval_runtime': 0.8587, 'eval_samples_per_second': 475.13, 'epoch': 0.2}


  7%|▋         | 100/1377 [00:15<02:36,  8.16it/s]

{'loss': 0.1866, 'learning_rate': 4.636891793754539e-05, 'epoch': 0.22}



  7%|▋         | 101/1377 [00:16<05:14,  4.05it/s]

{'eval_loss': 0.8027874827384949, 'eval_runtime': 0.8694, 'eval_samples_per_second': 469.272, 'epoch': 0.22}


  8%|▊         | 110/1377 [00:17<02:31,  8.36it/s]

{'loss': 0.3456, 'learning_rate': 4.600580973129993e-05, 'epoch': 0.24}



  8%|▊         | 111/1377 [00:18<05:03,  4.17it/s]

{'eval_loss': 0.7307906150817871, 'eval_runtime': 0.859, 'eval_samples_per_second': 474.986, 'epoch': 0.24}


  9%|▊         | 120/1377 [00:19<02:29,  8.42it/s]

{'loss': 0.3425, 'learning_rate': 4.564270152505447e-05, 'epoch': 0.26}



  9%|▉         | 121/1377 [00:20<04:59,  4.20it/s]

{'eval_loss': 0.825389564037323, 'eval_runtime': 0.8603, 'eval_samples_per_second': 474.274, 'epoch': 0.26}


  9%|▉         | 130/1377 [00:21<02:30,  8.30it/s]

{'loss': 0.2099, 'learning_rate': 4.5279593318809005e-05, 'epoch': 0.28}



 10%|▉         | 131/1377 [00:21<05:01,  4.14it/s]

{'eval_loss': 0.5968772172927856, 'eval_runtime': 0.8639, 'eval_samples_per_second': 472.291, 'epoch': 0.28}


 10%|█         | 140/1377 [00:22<02:28,  8.32it/s]

{'loss': 0.1881, 'learning_rate': 4.4916485112563545e-05, 'epoch': 0.31}



 10%|█         | 141/1377 [00:23<05:00,  4.11it/s]

{'eval_loss': 0.5848471522331238, 'eval_runtime': 0.8782, 'eval_samples_per_second': 464.568, 'epoch': 0.31}


 11%|█         | 150/1377 [00:24<02:27,  8.30it/s]

{'loss': 0.2986, 'learning_rate': 4.4553376906318085e-05, 'epoch': 0.33}



 11%|█         | 151/1377 [00:25<05:00,  4.08it/s]

{'eval_loss': 0.6254045963287354, 'eval_runtime': 0.8871, 'eval_samples_per_second': 459.917, 'epoch': 0.33}


 12%|█▏        | 160/1377 [00:26<02:26,  8.28it/s]

{'loss': 0.0652, 'learning_rate': 4.4190268700072624e-05, 'epoch': 0.35}



 12%|█▏        | 161/1377 [00:27<04:59,  4.07it/s]

{'eval_loss': 0.5782719254493713, 'eval_runtime': 0.8882, 'eval_samples_per_second': 459.366, 'epoch': 0.35}


 12%|█▏        | 170/1377 [00:27<02:26,  8.26it/s]

{'loss': 0.2196, 'learning_rate': 4.3827160493827164e-05, 'epoch': 0.37}



 12%|█▏        | 171/1377 [00:28<04:52,  4.13it/s]

{'eval_loss': 0.7075920701026917, 'eval_runtime': 0.8739, 'eval_samples_per_second': 466.873, 'epoch': 0.37}


 13%|█▎        | 180/1377 [00:29<02:25,  8.24it/s]

{'loss': 0.4173, 'learning_rate': 4.3464052287581704e-05, 'epoch': 0.39}



 13%|█▎        | 181/1377 [00:30<04:51,  4.10it/s]

{'eval_loss': 0.7851341366767883, 'eval_runtime': 0.872, 'eval_samples_per_second': 467.916, 'epoch': 0.39}


 14%|█▍        | 190/1377 [00:31<02:22,  8.31it/s]

{'loss': 0.1017, 'learning_rate': 4.3100944081336244e-05, 'epoch': 0.41}



 14%|█▍        | 191/1377 [00:32<04:48,  4.12it/s]

{'eval_loss': 0.5822449922561646, 'eval_runtime': 0.8804, 'eval_samples_per_second': 463.444, 'epoch': 0.41}


 15%|█▍        | 200/1377 [00:33<02:22,  8.26it/s]

{'loss': 0.2834, 'learning_rate': 4.273783587509078e-05, 'epoch': 0.44}



 15%|█▍        | 201/1377 [00:34<04:46,  4.10it/s]

{'eval_loss': 0.627220094203949, 'eval_runtime': 0.8804, 'eval_samples_per_second': 463.415, 'epoch': 0.44}


 15%|█▌        | 210/1377 [00:34<02:20,  8.31it/s]

{'loss': 0.1546, 'learning_rate': 4.2374727668845316e-05, 'epoch': 0.46}



 15%|█▌        | 211/1377 [00:35<04:46,  4.07it/s]

{'eval_loss': 0.6770721077919006, 'eval_runtime': 0.8937, 'eval_samples_per_second': 456.527, 'epoch': 0.46}


 16%|█▌        | 220/1377 [00:36<02:20,  8.25it/s]

{'loss': 0.2158, 'learning_rate': 4.2011619462599856e-05, 'epoch': 0.48}



 16%|█▌        | 221/1377 [00:37<04:42,  4.09it/s]

{'eval_loss': 0.9312111139297485, 'eval_runtime': 0.8834, 'eval_samples_per_second': 461.833, 'epoch': 0.48}


 17%|█▋        | 230/1377 [00:38<02:17,  8.34it/s]

{'loss': 0.2241, 'learning_rate': 4.1648511256354396e-05, 'epoch': 0.5}



 17%|█▋        | 231/1377 [00:39<04:38,  4.11it/s]

{'eval_loss': 0.851260244846344, 'eval_runtime': 0.8839, 'eval_samples_per_second': 461.61, 'epoch': 0.5}


 17%|█▋        | 240/1377 [00:39<02:19,  8.13it/s]

{'loss': 0.2603, 'learning_rate': 4.1285403050108935e-05, 'epoch': 0.52}



 18%|█▊        | 241/1377 [00:40<04:39,  4.06it/s]

{'eval_loss': 0.7680224776268005, 'eval_runtime': 0.8893, 'eval_samples_per_second': 458.802, 'epoch': 0.52}


 18%|█▊        | 250/1377 [00:41<02:17,  8.18it/s]

{'loss': 0.1684, 'learning_rate': 4.0922294843863475e-05, 'epoch': 0.54}



 18%|█▊        | 251/1377 [00:42<04:37,  4.05it/s]

{'eval_loss': 0.9593198895454407, 'eval_runtime': 0.8914, 'eval_samples_per_second': 457.697, 'epoch': 0.54}


 19%|█▉        | 260/1377 [00:43<02:17,  8.11it/s]

{'loss': 0.1396, 'learning_rate': 4.0559186637618015e-05, 'epoch': 0.57}



 19%|█▉        | 261/1377 [00:44<04:36,  4.03it/s]

{'eval_loss': 0.6982465982437134, 'eval_runtime': 0.8935, 'eval_samples_per_second': 456.625, 'epoch': 0.57}


 20%|█▉        | 270/1377 [00:45<02:18,  8.00it/s]

{'loss': 0.0377, 'learning_rate': 4.0196078431372555e-05, 'epoch': 0.59}



 20%|█▉        | 271/1377 [00:46<04:35,  4.02it/s]

{'eval_loss': 0.807191014289856, 'eval_runtime': 0.8931, 'eval_samples_per_second': 456.84, 'epoch': 0.59}


 20%|██        | 280/1377 [00:46<02:14,  8.13it/s]

{'loss': 0.1531, 'learning_rate': 3.9832970225127094e-05, 'epoch': 0.61}



 20%|██        | 281/1377 [00:47<04:31,  4.04it/s]

{'eval_loss': 0.8555824160575867, 'eval_runtime': 0.8982, 'eval_samples_per_second': 454.25, 'epoch': 0.61}


 21%|██        | 290/1377 [00:48<02:15,  8.00it/s]

{'loss': 0.0849, 'learning_rate': 3.946986201888163e-05, 'epoch': 0.63}



 21%|██        | 291/1377 [00:49<04:35,  3.95it/s]

{'eval_loss': 0.950406551361084, 'eval_runtime': 0.9263, 'eval_samples_per_second': 440.458, 'epoch': 0.63}


 22%|██▏       | 300/1377 [00:50<02:13,  8.09it/s]

{'loss': 0.2814, 'learning_rate': 3.910675381263617e-05, 'epoch': 0.65}



 22%|██▏       | 301/1377 [00:51<04:31,  3.96it/s]

{'eval_loss': 0.8430094122886658, 'eval_runtime': 0.9153, 'eval_samples_per_second': 445.745, 'epoch': 0.65}


 23%|██▎       | 310/1377 [00:52<02:11,  8.13it/s]

{'loss': 0.162, 'learning_rate': 3.874364560639071e-05, 'epoch': 0.68}



 23%|██▎       | 311/1377 [00:53<04:23,  4.04it/s]

{'eval_loss': 0.8655065894126892, 'eval_runtime': 0.8884, 'eval_samples_per_second': 459.269, 'epoch': 0.68}


 23%|██▎       | 320/1377 [00:53<02:09,  8.14it/s]

{'loss': 0.1491, 'learning_rate': 3.8380537400145246e-05, 'epoch': 0.7}



 23%|██▎       | 321/1377 [00:54<04:20,  4.06it/s]

{'eval_loss': 0.8159526586532593, 'eval_runtime': 0.8927, 'eval_samples_per_second': 457.046, 'epoch': 0.7}


 24%|██▍       | 330/1377 [00:55<02:06,  8.30it/s]

{'loss': 0.084, 'learning_rate': 3.8017429193899786e-05, 'epoch': 0.72}



 24%|██▍       | 331/1377 [00:56<04:20,  4.02it/s]

{'eval_loss': 0.9185711741447449, 'eval_runtime': 0.8978, 'eval_samples_per_second': 454.45, 'epoch': 0.72}


 25%|██▍       | 340/1377 [00:57<02:06,  8.18it/s]

{'loss': 0.007, 'learning_rate': 3.7654320987654326e-05, 'epoch': 0.74}




CPU training (final `trainer.train()` result):
```
TrainOutput(global_step=1377, training_loss=0.3178706283403118, metrics={'train_runtime': 863.3071, 'train_samples_per_second': 1.595, 'total_flos': 141940900890768.0, 'epoch': 3.0})
```

GPU training (final `trainer.train()` result):
```
TrainOutput(global_step=1377, training_loss=0.464037926368464, metrics={'train_runtime': 118.9554, 'train_samples_per_second': 11.576, 'total_flos': 141940900890768.0, 'epoch': 3.0})
```