In [1]:
# Notebook-wide path settings (the values look like placeholders -- set them
# to real directories before running).
bert_trained_model_dir = 'MODEL_SAVE_DIR'  # root folder for the fine-tuned model output
dataset_dir = 'DATASET_DIR'  # folder the dataset CSV is read from

In [2]:
from sklearn import model_selection


def train_test_val_split(datasets_df, ratio_train, ratio_test, ratio_val):
    """
    Divide the data set into training set, test set and validation set.

    The split is done in two passes, both with the fixed seed
    ``random_state=6``: first the training part is separated from the
    remainder, then the remainder is divided into test and validation parts.

    :param datasets_df: data to split (anything ``train_test_split`` accepts)
    :param ratio_train: fraction of the data assigned to the training set
    :param ratio_test: fraction of the data assigned to the test set
    :param ratio_val: fraction of the data assigned to the validation set
    :return: tuple ``(train_df, test_df, validation_df)``
    :raises AssertionError: if the three ratios do not sum to 1
    """
    # Compare with a tolerance: exact float equality rejects valid inputs
    # such as 0.7 + 0.1 + 0.2, which evaluates to 0.9999999999999999.
    assert abs(ratio_train + ratio_test + ratio_val - 1) <= 1e-9, \
        'ratio_train + ratio_test + ratio_val must sum to 1'

    # First pass: everything that is not training data goes to the remainder.
    train_df, remainder_df = model_selection.train_test_split(datasets_df,
                                                              random_state=6,
                                                              test_size=1 - ratio_train)

    # Second pass: share of the remainder that becomes the validation set.
    ratio = ratio_val / (1 - ratio_train)

    test_df, validation_df = model_selection.train_test_split(remainder_df,
                                                              random_state=6,
                                                              test_size=ratio)

    return train_df, test_df, validation_df

In [3]:
import os
import shutil

from simpletransformers.classification import ClassificationModel
import pandas as pd

# Same batch size for the training and the evaluation passes.
train_batch_size = eval_batch_size = 16

# Process-level settings: make only CUDA device 0 visible and switch off
# parallelism in the tokenizers library.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'TOKENIZERS_PARALLELISM': 'false',
})

In [None]:
# Read the PC dataset and carve it into train / test / validation parts.
train_ratio, test_ratio, validation_ratio = 0.6, 0.2, 0.2

PC_datasets_path = os.path.join(dataset_dir, 'pc.csv')
# header=None: the first CSV row is treated as data, not as column names.
datasets_df = pd.read_csv(PC_datasets_path, sep=',', header=None)

train_df, test_df, validation_df = train_test_val_split(
    datasets_df,
    ratio_train=train_ratio,
    ratio_test=test_ratio,
    ratio_val=validation_ratio,
)

# Report how many rows ended up in each part.
size_report = 'Train dataset size: {}\n Test dataset size: {}\n Validation dataset size: {}'
print(size_report.format(*(len(part) for part in (train_df, test_df, validation_df))))

In [None]:
# Number of full batches in the training set; used below as the interval
# (in steps) between evaluations during training.
full_batches_per_epoch = int(len(train_df) / train_batch_size)

model_args = dict(
    # --- output locations ---
    output_dir=os.path.join(bert_trained_model_dir, 'PC'),
    best_model_dir=os.path.join(bert_trained_model_dir, 'PC/best_model'),
    overwrite_output_dir=True,
    # --- checkpointing ---
    save_eval_checkpoints=False,
    save_model_every_epoch=False,
    save_steps=-1,
    # --- training schedule ---
    manual_seed=777,
    num_train_epochs=30,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    # --- evaluation during training ---
    evaluate_during_training=True,
    evaluate_during_training_steps=full_batches_per_epoch,
    evaluate_during_training_verbose=True,
    # --- early stopping ---
    use_early_stopping=True,
    early_stopping_consider_epochs=False,
    early_stopping_delta=0.02,
    early_stopping_patience=8,
    # --- multiprocessing ---
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
)

# BERT classifier initialised from 'bert-base-chinese' with 8 output labels.
model = ClassificationModel('bert', 'bert-base-chinese', num_labels=8, args=model_args)

In [6]:
# Best-effort removal of output left over from a previous run.
# Each directory is handled on its own: with a single shared try block, a
# missing best_model_dir aborted the cleanup before output_dir was removed.
for stale_dir in (model.args.best_model_dir, model.args.output_dir):
    try:
        shutil.rmtree(stale_dir)
    except FileNotFoundError:
        # Nothing to clean up (first run, or already removed above).
        pass
    except OSError as cleanup_error:
        # Stay best-effort, but report the failure instead of hiding it.
        print('Could not remove {}: {}'.format(stale_dir, cleanup_error))

In [None]:
# Fine-tune on the training split; the validation split is supplied as eval_df.
model.train_model(train_df=train_df, eval_df=validation_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score


def f1_multiclass(labels, predictions):
    """
    Micro-averaged F1 score of ``predictions`` against ``labels``.

    NOTE(review): for single-label multiclass data micro-averaging should
    give the same value as accuracy -- confirm 'micro' is the intended average.
    """
    return f1_score(y_true=labels, y_pred=predictions, average='micro')


def precision_multiclass(labels, predictions):
    """
    Micro-averaged precision of ``predictions`` against ``labels``.

    NOTE(review): for single-label multiclass data micro-averaging should
    give the same value as accuracy -- confirm 'micro' is the intended average.
    """
    return precision_score(y_true=labels, y_pred=predictions, average='micro')


def recall_multiclass(labels, predictions):
    """
    Micro-averaged recall of ``predictions`` against ``labels``.

    NOTE(review): for single-label multiclass data micro-averaging should
    give the same value as accuracy -- confirm 'micro' is the intended average.
    """
    return recall_score(y_true=labels, y_pred=predictions, average='micro')


# Final evaluation on the held-out test split.
# validation_df was already passed as eval_df during training (it drives the
# evaluation-during-training / early-stopping settings in model_args), so
# scoring on it again would give an optimistically biased estimate; test_df
# was split off but otherwise never used.
result, model_outputs, wrong_predictions = model.eval_model(test_df, f1=f1_multiclass, acc=accuracy_score,
                                                            precision=precision_multiclass, recall=recall_multiclass)

# Metrics dictionary first, then every misclassified example.
print(result)
for wrong_prediction in wrong_predictions:
    print(wrong_prediction)