In [1]:
!pip install -q datasets

In [2]:
from datasets import load_dataset

data_cn = load_dataset("lbox/lbox_open", "casename_classification")

data_cn['validation'][10]

  from .autonotebook import tqdm as notebook_tqdm


{'id': 180,
 'casetype': 'criminal',
 'casename': '강제추행',
 'facts': '피고인은 2020. 1. 22. 02:00경 서울 서초구 B 소재 **** 주점 2번 테이블에서 술을 마시다가 화장실을 다녀오던 중, 갑자기 테이블 바깥쪽에 앉아있던 피해자 C(여, 39세)의 왼쪽 가슴을 3회 만져 피해자를 강제로 추행하였다.'}

In [6]:
data_cn.num_rows

{'train': 8000, 'validation': 1000, 'test': 1000, 'test2': 1294}

In [3]:
dataset = data_cn

In [4]:
!pip install -U adapters
!pip install datasets
!pip install accelerate -U

Requirement already up-to-date: adapters in /home/dhaabb55/.local/lib/python3.8/site-packages (0.1.0)
Requirement already up-to-date: accelerate in /home/dhaabb55/.local/lib/python3.8/site-packages (0.24.1)


In [5]:
!pip install sentencepiece



In [6]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["casename"],batch["facts"], max_length=128, truncation=True, padding="max_length")

# Encode the input data
dataset = dataset.map(encode_batch, batched=True)
# The transformers model expects the target class column to be named "labels"
dataset = dataset.rename_column(original_column_name="casetype", new_column_name="labels")
# Transform to pytorch tensors and only output the required columns
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 5.99MB/s]
Map:   0%|          | 0/8000 [00:00<?, ? examples/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even 

In [7]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,
)
model = AutoAdapterModel.from_pretrained(
    "xlm-roberta-base",
    config=config,
)

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
model.safetensors: 100%|██████████| 1.12G/1.12G [00:57<00:00, 19.3MB/s]
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Add a new adapter
model.add_adapter("rotten_tomatoes")
# Add a matching classification head
model.add_classification_head(
    "rotten_tomatoes",
    num_labels=2,
    id2label={ 0: "civil", 1: "criminal"}
  )
# Activate the adapter
model.train_adapter("rotten_tomatoes")

In [10]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_accuracy,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
from transformers import TextClassificationPipeline

classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=training_args.device.index)

classifier("This is awesome!")

In [None]:
model.save_adapter("./final_adapter", "rotten_tomatoes")

!ls -lh final_adapter