In [None]:
# Report the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Fri Feb 10 11:34:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    28W /  70W |      0MiB / 15360MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install ekphrasis scikit-learn pandas numpy torch transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ekphrasis
  Downloading ekphrasis-0.5.4-py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 KB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ujson
  Downloading

In [None]:
import random
import os
import json
import pandas as pd
import numpy as np
import torch
from torch import nn
import logging
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
def set_seed(seed_val):
    """Seed every random number generator used by the training code.

    ``seed_val`` is applied, in order, to Python's ``random`` module, NumPy's
    global generator, PyTorch's default generator and all CUDA generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_val)


def get_class_weights(train_set):
    """Return scikit-learn 'balanced' class weights for a training set.

    The labels are read from ``train_set['label']``; the classes handed to
    ``compute_class_weight`` are the distinct label values found by
    ``np.unique``.
    """
    labels = train_set['label']
    distinct_labels = np.unique(labels)
    return compute_class_weight(class_weight='balanced', classes=distinct_labels, y=labels)

class CustomTrainer(Trainer):
    """``Trainer`` variant that trains with a class-weighted cross-entropy loss.

    Args:
        class_wts: One weight per class, passed to
            ``nn.CrossEntropyLoss(weight=...)``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_wts, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_wts = class_wts

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; its ``"labels"`` entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so the override stays
                callable by ``Trainer`` versions that pass this keyword.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Create the weights on the same device as the logits instead of
        # calling .cuda(), so the loss also works on CPU-only runtimes.
        weight = torch.tensor(self.class_wts, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


def train_model(model_name='roberta-base', train_path="./train_set.csv", num_epochs=4, num_labels=4, max_length=128, seed_val=42, batch_size=8, learning_rate=2e-5, weight_decay=1e-8, save_model_path="./models/", use_custom_loss=False, output_dir="test_trainer"):
    """Fine-tune a sequence-classification model on a CSV file and save it.

    The CSV at ``train_path`` is tokenized from its ``text`` column (padded and
    truncated to ``max_length`` tokens) and used to train ``model_name``.  The
    tokenizer, the model and a ``params.json`` file recording the arguments
    are written to ``os.path.join(save_model_path, model_name)``.

    Args:
        model_name: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
        train_path: CSV file with a ``text`` column; its ``label`` column is
            read to derive class weights when ``use_custom_loss`` is set.
        num_epochs: Number of training epochs.
        num_labels: Number of output classes of the classification head.
        max_length: Tokenizer ``model_max_length`` used for padding/truncation.
        seed_val: Seed applied via ``set_seed`` and ``TrainingArguments``.
        batch_size: Per-device training batch size.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        weight_decay: Weight decay passed to ``TrainingArguments``.
        save_model_path: Parent directory for the saved model directory.
        use_custom_loss: Train with ``CustomTrainer`` and balanced class
            weights instead of the stock ``Trainer``.
        output_dir: Working directory given to ``TrainingArguments``.  Pass a
            distinct value per run to keep runs from sharing one directory.
    """
    set_seed(seed_val)

    logger = logging.getLogger(__name__)
    logger.info(f'Using seed: {seed_val}')

    train_df = pd.read_csv(train_path)
    train_set = Dataset.from_pandas(train_df)

    logger.info('dataset loaded')

    tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_length)

    logger.info('tokenizer loaded')

    def tokenize_function(examples):
        # No explicit max_length here: padding and truncation fall back to the
        # tokenizer's model_max_length, which was set to max_length above.
        return tokenizer(examples["text"], padding='max_length', truncation=True)

    tokenized_dataset_train = train_set.map(tokenize_function, batched=True)

    logger.info('tokenized dataset')

    # max_length is deliberately not forwarded to the model: on the model
    # config it is a text-generation setting and plays no role in sequence
    # classification.  Input length is enforced by the tokenizer.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    logger.info('model loaded')

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        seed=seed_val,
    )

    if use_custom_loss:
        class_wts = get_class_weights(train_set)
        trainer = CustomTrainer(class_wts, model=model, args=training_args, train_dataset=tokenized_dataset_train)
    else:
        trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset_train)

    logger.info('starting training')
    trainer.train()
    logger.info('training finished')

    logger.info('saving model and tokenizer')
    save_directory = os.path.join(save_model_path, model_name)
    os.makedirs(save_directory, exist_ok=True)
    tokenizer.save_pretrained(save_directory)
    model.save_pretrained(save_directory)

    logger.info('saving parameters')
    params = {
        'model_name': model_name,
        # Recorded so models trained on different CSVs can be told apart.
        'train_path': train_path,
        'num_epochs': num_epochs,
        'num_labels': num_labels,
        'max_length': max_length,
        'seed_val': seed_val,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'weight_decay': weight_decay,
        'use_custom_loss': use_custom_loss
    }

    with open(os.path.join(save_directory, 'params.json'), 'w') as f:
        json.dump(params, f)

In [None]:
# Configure root logging for the notebook.
# force=True replaces any handlers already attached to the root logger;
# without it logging.basicConfig() does nothing when such handlers exist,
# and the INFO messages emitted by this notebook would not be shown.
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt, force=True)
logger = logging.getLogger(__name__)

In [None]:
# Fine-tune twice with the default hyper-parameters: once on the default
# training CSV and once on ./train_set_hand.csv, each saved under its own
# directory.
logger.info('training model')
for train_kwargs in (
    dict(save_model_path='./models/'),
    dict(train_path='./train_set_hand.csv', save_model_path='./models/hand/'),
):
    train_model(**train_kwargs)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /root/.cache/huggingface/hub/mo

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_length": 128,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size"

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


tokenizer config file saved in ./models/roberta-base/tokenizer_config.json
Special tokens file saved in ./models/roberta-base/special_tokens_map.json
Configuration saved in ./models/roberta-base/config.json
Model weights saved in ./models/roberta-base/pytorch_model.bin
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_length": 128,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size"

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


tokenizer config file saved in ./models/hand/roberta-base/tokenizer_config.json
Special tokens file saved in ./models/hand/roberta-base/special_tokens_map.json
Configuration saved in ./models/hand/roberta-base/config.json
Model weights saved in ./models/hand/roberta-base/pytorch_model.bin


In [None]:
import os
import json
import pandas as pd
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def predict(model_path="./models/roberta-base", test_path="./test_set.csv"):
    """Evaluate a saved classifier on a CSV test set and print the metrics.

    Loads the model, tokenizer and ``params.json`` from ``model_path`` via
    ``load_model``, classifies the ``text`` column of the CSV at ``test_path``
    and prints a classification report and a confusion matrix against its
    ``label`` column.

    Args:
        model_path: Directory containing the saved model, tokenizer and
            ``params.json``.
        test_path: CSV file with ``text`` and ``label`` columns.
    """
    logger = logging.getLogger(__name__)
    logger.info('loading model, tokenizer and parameters')
    model, tokenizer, params = load_model(model_path)

    # Ensure that the truth value is a Python boolean.  json.dump() stores a
    # real boolean, so only a string such as "True"/"false" needs parsing;
    # calling .lower() on a bool would raise AttributeError.
    use_custom_loss = params.get("use_custom_loss", False)
    if isinstance(use_custom_loss, str):
        use_custom_loss = json.loads(use_custom_loss.lower())
    params["use_custom_loss"] = bool(use_custom_loss)
    logger.info(f'model was trained with parameters: {params}')

    logger.info('loading test set')
    test_df = pd.read_csv(test_path)

    # Use the first GPU when one is available, otherwise run on CPU (-1).
    device = 0 if torch.cuda.is_available() else -1
    pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

    logger.info('predicting on test set')
    predictions = pipeline(test_df['text'].tolist(), padding='max_length', truncation=True)

    # Map the predicted label names back to integer ids using the mapping
    # already loaded with the model (the same data stored in config.json).
    label2id = model.config.label2id
    preds = [label2id[pred['label']] for pred in predictions]

    y_true = test_df['label'].tolist()

    # zero_division=0 reports 0.0 for classes that are never predicted, as the
    # default does, but without emitting a warning for each such metric.
    print(classification_report(y_true, preds, digits=4, zero_division=0))

    cm = confusion_matrix(y_true, preds)
    print(f'confusion matrix:\n{cm}')


def load_model(path="models/roberta-base/"):
    """Load a saved classifier, its tokenizer and its training parameters.

    Args:
        path: Directory holding the saved model, the tokenizer files and a
            ``params.json`` file.

    Returns:
        Tuple ``(model, tokenizer, params)`` where ``params`` is the parsed
        content of ``params.json``.
    """
    logger = logging.getLogger(__name__)
    logger.info(f'loading model from {path}')
    # Do not override num_labels: the saved config.json already defines the
    # label set, so a model trained with a different number of labels loads
    # with a matching classification head.
    model = AutoModelForSequenceClassification.from_pretrained(path)
    logger.info(f'loading tokenizer from {path}')
    tokenizer = AutoTokenizer.from_pretrained(path)
    logger.info(f'loading parameters from {path}')
    with open(os.path.join(path, "params.json"), "r") as f:
        params = json.load(f)
    return model, tokenizer, params

In [None]:
# Evaluate both saved models on both test sets (four runs in total).
logger.info('predicting')
for predict_kwargs in (
    dict(),
    dict(test_path='./test_set_hand.csv'),
    dict(model_path='./models/hand/roberta-base/'),
    dict(model_path='./models/hand/roberta-base/', test_path='./test_set_hand.csv'),
):
    predict(**predict_kwargs)

loading configuration file ./models/roberta-base/config.json
Model config RobertaConfig {
  "_name_or_path": "./models/roberta-base",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_length": 128,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,


              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667        50
           1     0.0000    0.0000    0.0000        15
           2     0.0000    0.0000    0.0000        17
           3     0.0000    0.0000    0.0000        18

    accuracy                         0.5000       100
   macro avg     0.1250    0.2500    0.1667       100
weighted avg     0.2500    0.5000    0.3333       100

confusion matrix:
[[50  0  0  0]
 [15  0  0  0]
 [17  0  0  0]
 [18  0  0  0]]


All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the model checkpoint at ./models/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file ./models/hand/roberta-base/config.json
Model config RobertaConfig {
  "_name

              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667        50
           1     0.0000    0.0000    0.0000        15
           2     0.0000    0.0000    0.0000        17
           3     0.0000    0.0000    0.0000        18

    accuracy                         0.5000       100
   macro avg     0.1250    0.2500    0.1667       100
weighted avg     0.2500    0.5000    0.3333       100

confusion matrix:
[[50  0  0  0]
 [15  0  0  0]
 [17  0  0  0]
 [18  0  0  0]]


All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the model checkpoint at ./models/hand/roberta-base/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
loading configuration file ./models/hand/roberta-base/config.json
Model config RobertaConfig {
  

              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667        50
           1     0.0000    0.0000    0.0000        15
           2     0.0000    0.0000    0.0000        17
           3     0.0000    0.0000    0.0000        18

    accuracy                         0.5000       100
   macro avg     0.1250    0.2500    0.1667       100
weighted avg     0.2500    0.5000    0.3333       100

confusion matrix:
[[50  0  0  0]
 [15  0  0  0]
 [17  0  0  0]
 [18  0  0  0]]


All model checkpoint weights were used when initializing RobertaForSequenceClassification.

All the weights of RobertaForSequenceClassification were initialized from the model checkpoint at ./models/hand/roberta-base/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForSequenceClassification for predictions without further training.
loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667        50
           1     0.0000    0.0000    0.0000        15
           2     0.0000    0.0000    0.0000        17
           3     0.0000    0.0000    0.0000        18

    accuracy                         0.5000       100
   macro avg     0.1250    0.2500    0.1667       100
weighted avg     0.2500    0.5000    0.3333       100

confusion matrix:
[[50  0  0  0]
 [15  0  0  0]
 [17  0  0  0]
 [18  0  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
