In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-df0d0da0-ec12-48c9-5703-59009a3fba47)


In [1]:
%%capture install_log 

!pip install transformers datasets evaluate

In [2]:
import torch
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    PreTrainedTokenizerFast,
    RobertaConfig,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('gngpostalsrvc/BERiT')

def preprocess(examples):
  
    encoding = tokenizer(examples['Text'], max_length=128, truncation=True, padding=True)
    encoding['labels'] = [[stage] for stage in examples['Stage']]

    return encoding

raw_data = load_dataset('gngpostalsrvc/COHeN')

tokenized_data = raw_data.map(preprocess, batched=True, remove_columns=raw_data['train'].column_names)
tokenized_data.set_format("pt", columns=["input_ids", "attention_mask", "labels"], output_all_columns=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/106 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/916 [00:00<?, ?B/s]

Downloading and preparing dataset csv/default (download: 1.08 MiB, generated: 2.56 MiB, post-processed: Unknown size, total: 3.64 MiB) to /root/.cache/huggingface/datasets/gngpostalsrvc___parquet/gngpostalsrvc--COHeN-97096b619f4d4787/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/890k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/9574 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1197 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1197 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/gngpostalsrvc___parquet/gngpostalsrvc--COHeN-97096b619f4d4787/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/9574 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

Map:   0%|          | 0/1197 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained('gngpostalsrvc/BERiT', num_labels=4)

args = TrainingArguments(
    output_dir="COHeN",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.0027431492469971175,
    weight_decay=0.004900150335195089,
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    hub_token='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',
    push_to_hub=True,
    seed=42,
)

def compute_metrics(eval_preds):
  metrics = evaluate.load('accuracy')
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metrics.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
  )

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gngpostalsrvc--BERiT/snapshots/5799f2933c5845e55984f628ee81b985752c3897/config.json
Model config RobertaConfig {
  "_name_or_path": "gngpostalsrvc/BERiT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.5,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.5,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 128,
  "model_type": "roberta",
  "num_attention_heads": 4,
  "num_hidden_layers": 1,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.952309,0.607352
2,1.130200,0.722395,0.705931
3,1.130200,0.595825,0.771094
4,0.841700,0.592209,0.784461
5,0.667600,0.536114,0.827903
6,0.667600,0.532332,0.827068
7,0.559900,0.492757,0.84127
8,0.559900,0.453408,0.843776
9,0.508300,0.480029,0.844612
10,0.456900,0.487078,0.852966


***** Running Evaluation *****
  Num examples = 1197
  Batch size = 32


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Saving model checkpoint to COHeN/checkpoint-300
Configuration saved in COHeN/checkpoint-300/config.json
Model weights saved in COHeN/checkpoint-300/pytorch_model.bin
tokenizer config file saved in COHeN/checkpoint-300/tokenizer_config.json
Special tokens file saved in COHeN/checkpoint-300/special_tokens_map.json
tokenizer config file saved in COHeN/tokenizer_config.json
Special tokens file saved in COHeN/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1197
  Batch size = 32
Saving model checkpoint to COHeN/checkpoint-600
Configuration saved in COHeN/checkpoint-600/config.json
Model weights saved in COHeN/checkpoint-600/pytorch_model.bin
tokenizer config file saved in COHeN/checkpoint-600/tokenizer_config.json
Special tokens file saved in COHeN/checkpoint-600/special_tokens_map.json
tokenizer config file saved in COHeN/tokenizer_config.json
Special tokens file saved in COHeN/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1197
  Batch size

TrainOutput(global_step=6000, training_loss=0.5369424870808919, metrics={'train_runtime': 623.2086, 'train_samples_per_second': 307.249, 'train_steps_per_second': 9.628, 'total_flos': 121098830376384.0, 'train_loss': 0.5369424870808919, 'epoch': 20.0})