In [1]:
import json
import os
import sys
from datetime import datetime
import pandas as pd
import torch
from torch import BoolTensor
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hugging Face checkpoint name; later used to load both the tokenizer and the classifier.
MODEL_NAME = 'distilbert-base-uncased'
# Maximum sequence length handed to the tokenizer (inputs are tokenized with truncation enabled).
MAX_LENGTH = 512

# File name of the CSV dataset.
FILENAME_TEST = 'test.csv'
# Output directory passed to TrainingArguments.
DIR_OUTPUT = 'results'

# NOTE(review): default device name; no visible cell reads it — confirm it is still needed.
DEVICE_DEFAULT = 'cuda'

def get_ts():
    """Return the current UTC time as an ISO-8601 string with second precision.

    The result has no timezone suffix (e.g. ``'2025-04-16T07:53:41'``), which
    is the same format the log lines in this notebook have always used.

    Returns:
        str: timestamp formatted as ``YYYY-MM-DDTHH:MM:SS``.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    # `datetime.utcnow()` is deprecated since Python 3.12; take an aware UTC
    # time instead and strip tzinfo so the rendered string keeps its old shape.
    now_utc = datetime.now(timezone.utc)
    return now_utc.replace(microsecond=0, tzinfo=None).isoformat()
# end


class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: mapping from feature name (e.g. ``input_ids``) to a
            per-sample indexable container; either tensors or nested lists.
        labels: sequence of labels, one per sample; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``labels`` entry.

        Every feature in ``encodings`` is copied into a fresh tensor, and the
        label is wrapped in a one-element tensor.
        """
        item = {}
        for name, values in self.encodings.items():
            value = values[idx]
            if torch.is_tensor(value):
                # Copying an existing tensor through torch.tensor() emits a
                # UserWarning on every access; clone().detach() is the
                # recommended equivalent and still yields an independent copy.
                item[name] = value.clone().detach()
            else:
                item[name] = torch.tensor(value)
        # Label keeps its original shape (1,), as downstream code reshapes it.
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of samples (the number of labels)."""
        return len(self.labels)
    # end
# end

def read_passages(path_data, path_label, test_size=0, max_rows=200):
    """Load passages and their integer-encoded labels, optionally split.

    Args:
        path_data: path of a CSV file with a ``processed`` column (the text)
            and a ``target`` column (the label name).
        path_label: path of a JSON file holding the label names; the names are
            sorted and the position in the sorted list becomes the label id.
        test_size: forwarded to ``train_test_split`` when greater than 0.
            With the default of 0 no split is made and the full data is
            returned as both the train and the validation part.
        max_rows: keep only the first ``max_rows`` rows of the CSV. Defaults
            to 200, the limit that used to be hard-coded; pass ``None`` to use
            every row. Expected to be ``None`` or a non-negative int.

    Returns:
        tuple: ``((train_samples, valid_samples, train_labels, valid_labels),
        labels_list)`` where ``labels_list`` is the sorted list of label names.

    Raises:
        KeyError: if the CSV contains a target that is missing from the label
            file.
    """
    df = pd.read_csv(path_data)
    if max_rows is not None:
        df = df.iloc[:max_rows]

    samples = df['processed'].to_list()
    labels_str = df['target'].to_list()

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path_label, 'r', encoding='utf-8') as file:
        labels_list = sorted(json.load(file))
    # end

    label_to_id = {name: idx for idx, name in enumerate(labels_list)}

    # Fail with a readable message instead of a bare KeyError on one value.
    unknown = sorted({str(name) for name in labels_str if name not in label_to_id})
    if unknown:
        raise KeyError('targets in {} missing from {}: {}'.format(path_data, path_label, unknown))
    # end

    labels = [label_to_id[name] for name in labels_str]

    if test_size > 0:
        # Stratified split with a fixed seed so reruns give the same partition.
        return train_test_split(samples, labels, test_size=test_size, stratify=labels, random_state=234), labels_list
    else:
        return (samples, samples, labels, labels), labels_list
    # end
# end




In [2]:

# Input files: the dataset CSV and the JSON file listing the label names.
# NOTE(review): `path_train` points at the file named by FILENAME_TEST
# ('test.csv'); confirm this is the intended training source.
path_train = os.path.join('.', FILENAME_TEST)
path_label = os.path.join('.', 'label.json')

print('[{}] start main_train_and_evaluate with {} {}'.format(get_ts(), path_train, path_label))

model_name = MODEL_NAME
max_length = MAX_LENGTH
output_dir = DIR_OUTPUT

# Hold out 10% of the rows for validation.
(train_samples, valid_samples, train_labels, valid_labels), target_names = read_passages(
    path_train, path_label, 0.1)

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which takes the same arguments for a list of texts.
train_encodings = tokenizer(train_samples, truncation=True, padding=True, max_length=max_length,
                            return_tensors='pt')
valid_encodings = tokenizer(valid_samples, truncation=True, padding=True, max_length=max_length,
                            return_tensors='pt')

train_dataset = SimpleDataset(train_encodings, train_labels)
valid_dataset = SimpleDataset(valid_encodings, valid_labels)

# One output unit per label name found in the label file.
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

  return datetime.utcnow().replace(microsecond=0).isoformat()


[2025-04-16T07:53:41] start main_train_and_evaluate with .\test.csv .\label.json


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [26]:
def compute_metrics(pred): # pred:  ['count', 'index', 'label_ids', 'predictions']
    """Compute accuracy and macro-averaged precision/recall/F1 for the Trainer.

    Args:
        pred: prediction object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores whose last axis is argmax-ed).

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1`` as floats.
    """
    # Labels arrive with a trailing singleton axis; flatten both sides.
    labels = pred.label_ids.reshape(-1)
    preds = pred.predictions.argmax(-1).reshape(-1)

    print('jinyuj: len of labels: {}'.format(len(labels)))
    # This line used to be labelled "labels" although it reports predictions.
    print('jinyuj: len of preds: {}'.format(len(preds)))

    # NOTE(review): zero_division=1 scores an undefined per-class metric as 1,
    # which can inflate macro precision when a class is never predicted;
    # confirm this is intended (zero_division=0 is the conservative choice).
    macro_kwargs = dict(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=preds),
        "precision": precision_score(**macro_kwargs),
        "recall": recall_score(**macro_kwargs),
        "f1": f1_score(**macro_kwargs),
    }
# end


In [None]:

# Hyper-parameters for a single-epoch fine-tuning run. `output_dir` and the
# `model` / `train_dataset` / `valid_dataset` objects used below come from an
# earlier cell.
training_args = TrainingArguments(
    output_dir=output_dir,  # output directory
    num_train_epochs=1,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,  # batch size for evaluation
    warmup_steps=0,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    # NOTE(review): `load_best_model_at_end` is not set here, so `metric_for_best_model`
    # below presumably does not influence which weights are kept — confirm and either set it or drop the metric.
    # logging_steps=1,  # log & save weights each logging_steps
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    learning_rate=2e-5,
    # save_strategy='epoch',
    metric_for_best_model='f1'
)

# Disabled variant with early stopping, kept for reference.
# NOTE(review): presumably needs `load_best_model_at_end=True` and a matching
# save strategy in the arguments above before it can be re-enabled — verify.
# trainer = Trainer(
#     model=model,  # the instantiated Transformers model to be trained
#     args=training_args,  # training arguments, defined above
#     train_dataset=train_dataset,  # training dataset
#     eval_dataset=valid_dataset,  # evaluation dataset
#     compute_metrics=compute_metrics,  # the callback that computes metrics of interest
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=6)]
# )

# Active trainer: no callbacks, metrics computed by `compute_metrics`.
trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=valid_dataset,  # evaluation dataset
    compute_metrics=compute_metrics
)

print('[{}] start training...'.format(get_ts()))
trainer.train()

# Extra evaluation pass on `valid_dataset` after training; the result is kept
# in `info_state_model`. The "finish training" line is printed only after this
# evaluation has completed.
info_state_model = trainer.evaluate()
print('[{}] finish training.'.format(get_ts()))

################## start to do eval ##################




PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
  return datetime.utcnow().replace(microsecond=0).isoformat()
***** Running training *****
  Num examples = 180
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12


[2025-04-16T08:10:55] start training...


  0%|          | 0/12 [00:00<?, ?it/s]

  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 20
  Batch size = 16


  0%|          | 0/2 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 20
  Batch size = 16


jinyuj: len of labels: 20
jinyuj: len of labels: 20
{'eval_loss': 0.23033574223518372, 'eval_accuracy': 0.95, 'eval_precision': 0.975, 'eval_recall': 0.5, 'eval_f1': 0.48717948717948717, 'eval_runtime': 0.0442, 'eval_samples_per_second': 452.499, 'eval_steps_per_second': 45.25, 'epoch': 1.0}
{'train_runtime': 0.9636, 'train_samples_per_second': 186.794, 'train_steps_per_second': 12.453, 'train_loss': 0.2573344906171163, 'epoch': 1.0}


  item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}


  0%|          | 0/2 [00:00<?, ?it/s]

jinyuj: len of labels: 20
jinyuj: len of labels: 20
[2025-04-16T08:10:56] finish training.


  return datetime.utcnow().replace(microsecond=0).isoformat()
