In [None]:
# Install the fine-tuning helper package straight from GitHub; it provides the
# `transformers_tf_finetune` modules imported in the next cell.
!pip install git+https://github.com/cosmoquester/transformers-tf-finetune.git

In [None]:
import argparse
import csv
import random
import sys
import urllib.request
from typing import Tuple

import tensorflow as tf
from transformers import AdamWeightDecay, AutoTokenizer

from transformers_tf_finetune.models import TFBartForSequenceMultiClassification
from transformers_tf_finetune.utils import LRScheduler, get_device_strategy, get_logger, path_join, set_random_seed

# Config

In [None]:
#: transformers pretrained path (handed to the model's `from_pretrained`)
pretrained_model = "cosmoquester/bart-ko-small"
#: pretrained tokenizer fast pretrained path (handed to `AutoTokenizer.from_pretrained`)
pretrained_tokenizer = "cosmoquester/bart-ko-small"
#: load from pytorch weight (forwarded as `from_pt` when the model is loaded)
from_pytorch = False
#: use huggingface credential for private model (forwarded as `use_auth_token`
#: to both the tokenizer and the model `from_pretrained` calls)
use_auth_token = ""

#: training / dev TSV locations; values starting with "https://" are downloaded,
#: anything else is opened as a local file
train_dataset_path = "https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/labeled/train.tsv"
dev_dataset_path = "https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/labeled/dev.tsv"
#: output directory to save log and model checkpoints, should be GCS path with TPU;
#: when None, training runs without the checkpoint and TensorBoard callbacks
output_path = None

#: training params
#: number of passes over the training data
epochs = 5
#: learning-rate settings handed positionally to `LRScheduler`
#: NOTE(review): presumably peak LR, floor LR, warmup fraction and an explicit
#: warmup step count -- confirm against `LRScheduler`'s signature
learning_rate = 5e-5
min_learning_rate = 1e-5
warmup_rate = 0.06
warmup_steps = None
#: batch size of the training dataset
batch_size = 128
#: batch size of the validation and dev datasets
dev_batch_size = 512
#: number of examples taken from the front of the shuffled training data for validation
num_valid_dataset = 500
#: forwarded as `update_freq` to the TensorBoard callback
tensorboard_update_freq = 1

#: device to use (TPU or GPU or CPU)
device = "TPU"
#: Use mixed precision (bfloat16 policy when device is "TPU", float16 policy otherwise)
mixed_precision = False
#: Set random seed; None leaves seeding off
seed = None

In [None]:
# Writing to a GCS bucket from Colab needs the user to be authenticated first.
# `google.colab` is imported lazily because it is only needed in that case.
if (output_path or "").startswith("gs://"):
    from google.colab import auth

    auth.authenticate_user()

In [None]:
def load_dataset(dataset_path: str, tokenizer: AutoTokenizer, shuffle: bool = False) -> tf.data.Dataset:
    """
    Load Hate Speech dataset from local file or web

    The file is read as UTF-8 TSV. The first line is treated as a header and
    dropped; every following non-blank row must have four columns:
    comment, an unused column, bias label, hate label.

    :param dataset_path: local file path or http(s) url
    :param tokenizer: PreTrainedTokenizer for tokenizing
    :param shuffle: whether shuffling lines or not
    :returns: Hate Speech dataset whose elements are
        (tokenizer outputs, {"bias": bias label id, "hate": hate label id})
    :raises KeyError: if a row carries a label missing from the label maps
    :raises ValueError: if a non-blank row does not have exactly four columns
    """
    if dataset_path.startswith(("http://", "https://")):
        with urllib.request.urlopen(dataset_path) as response:
            data = response.read().decode("utf-8")
    else:
        # Decode local files as UTF-8 too, matching the download branch, instead of
        # relying on the platform's default locale encoding.
        with open(dataset_path, encoding="utf-8") as f:
            data = f.read()

    # Drop the header row.
    lines = data.splitlines()[1:]
    if shuffle:
        random.shuffle(lines)

    bos = tokenizer.bos_token
    eos = tokenizer.eos_token

    bias_label2id = {"none": 0, "gender": 1, "others": 2}
    hate_label2id = {"none": 0, "hate": 1, "offensive": 2}

    sentences = []
    bias_labels = []
    hate_labels = []
    for row in csv.reader(lines, delimiter="\t"):
        if not row:
            # csv.reader yields an empty list for a blank line; skip it rather
            # than failing on the four-way unpack below.
            continue
        comment, _, bias_label, hate_label = row
        # Each comment is wrapped in the tokenizer's BOS/EOS tokens before tokenizing.
        sentences.append(bos + comment + eos)
        bias_labels.append(bias_label2id[bias_label])
        hate_labels.append(hate_label2id[hate_label])

    # padding=True pads every sentence to a common length so the outputs can be
    # sliced example-by-example into a dataset.
    inputs = dict(
        tokenizer(
            sentences,
            padding=True,
            return_tensors="tf",
            return_token_type_ids=False,
            return_attention_mask=True,
        )
    )

    dataset = tf.data.Dataset.from_tensor_slices((inputs, {"bias": bias_labels, "hate": hate_labels}))
    return dataset

In [None]:
# Seed whenever a seed is configured. The explicit `is not None` check keeps
# 0 usable as a seed; a plain truthiness test would silently skip it.
if seed is not None:
    set_random_seed(seed)

In [None]:
# Build the strategy object for the configured device; the following cells all
# run their work inside `strategy.scope()`.
strategy = get_device_strategy(device)

# Mixed Precision

In [None]:
with strategy.scope():
    if mixed_precision:
        # bfloat16 policy on TPU, float16 policy on every other device.
        mixed_type = "mixed_bfloat16" if device == "TPU" else "mixed_float16"
        if hasattr(tf.keras.mixed_precision, "set_global_policy"):
            # Stable API; accepts the policy name directly.
            tf.keras.mixed_precision.set_global_policy(mixed_type)
        else:
            # Older TensorFlow releases only ship the experimental API.
            policy = tf.keras.mixed_precision.experimental.Policy(mixed_type)
            tf.keras.mixed_precision.experimental.set_policy(policy)

# Load Dataset

In [None]:
with strategy.scope():
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer, use_auth_token=use_auth_token)

    # The training file is shuffled once while loading, then split: the first
    # `num_valid_dataset` examples become the validation set, the rest the training set.
    dataset = load_dataset(train_dataset_path, tokenizer, shuffle=True)
    valid_dataset = dataset.take(num_valid_dataset).batch(dev_batch_size)
    train_dataset = dataset.skip(num_valid_dataset).batch(batch_size)

    # The dev file is loaded in file order.
    dev_dataset = load_dataset(dev_dataset_path, tokenizer).batch(dev_batch_size)

# Load Model

In [None]:
with strategy.scope():
    # Two classification outputs, "bias" and "hate", with three classes each,
    # matching the label maps used when the dataset is loaded.
    model = TFBartForSequenceMultiClassification.from_pretrained(
        pretrained_model, list_num_labels={"bias": 3, "hate": 3}, use_auth_token=use_auth_token, from_pt=from_pytorch
    )

# Compile Model

In [None]:
with strategy.scope():
    # `train_dataset` is already batched, so its length is the number of batches per epoch.
    total_train_steps = len(train_dataset) * epochs
    lr_schedule = LRScheduler(
        total_train_steps,
        learning_rate,
        min_learning_rate,
        warmup_rate,
        warmup_steps,
    )

    model.compile(
        optimizer=AdamWeightDecay(
            lr_schedule,
            weight_decay_rate=0.01,
            # Weight decay is not applied to parameters whose names match these patterns.
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        ),
        # A single loss and a single metric are given for the whole model.
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
    )

# Model Training

In [None]:
with strategy.scope():
    # Checkpointing and TensorBoard logging both write under `output_path`;
    # without one, training runs with no callbacks.
    if output_path is None:
        fit_callbacks = None
    else:
        fit_callbacks = [
            # Keep only the weights of the epoch with the lowest validation loss.
            tf.keras.callbacks.ModelCheckpoint(
                path_join(output_path, "best_model.ckpt"),
                save_weights_only=True,
                save_best_only=True,
                monitor="val_loss",
                mode="min",
                verbose=1,
            ),
            tf.keras.callbacks.TensorBoard(path_join(output_path, "logs"), update_freq=tensorboard_update_freq),
        ]

    model.fit(
        train_dataset,
        validation_data=valid_dataset,
        epochs=epochs,
        callbacks=fit_callbacks,
    )

# Model Evaluation

In [None]:
with strategy.scope():
    # Evaluate on the dev set; the result is unpacked into the overall loss,
    # then one loss and one accuracy value per output ("bias", "hate").
    eval_results = model.evaluate(dev_dataset)
    total_loss, bias_loss, hate_loss, bias_accuracy, hate_accuracy = eval_results