In [1]:
!pip install transformers datasets evaluate
!pip install accelerate -U
!pip install transformers[torch]
!pip install torch -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloadin

In [1]:
from huggingface_hub import notebook_login

In [43]:
from datasets import load_dataset

# Download the labelled dataset from the Hugging Face Hub.
inout_training_set = load_dataset("davidgaofc/Shadow_inout")
# Shuffle with a fixed seed so the row order is reproducible.
shuffle_set = inout_training_set.shuffle(seed=51)
# Hold out 20% of the 'train' split for evaluation. train_test_split shuffles
# again by default, so it needs its own seed; without one the train/test
# membership (and every metric below) changes on each run.
training_set = shuffle_set['train'].train_test_split(test_size=0.2, seed=51)

In [44]:
from transformers import AutoTokenizer

In [45]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint; it is used by
# preprocess_function and passed to the data collator and Trainer below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [46]:
def preprocess_function(examples):
    """Tokenize the ``Text`` field of a batch with truncation enabled.

    Uses the notebook-level ``tokenizer``. No padding is requested here, so
    the encoded sequences keep their individual lengths.

    Args:
        examples: mapping that exposes the raw inputs under the ``"Text"`` key.

    Returns:
        Whatever the tokenizer returns for those inputs.
    """
    texts = examples["Text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def clean_function(examples):
    """Mirror the ``Label`` column under the lowercase key ``label``.

    The incoming mapping is modified in place and that same object is
    returned; the original ``Label`` entry is left untouched.

    Args:
        examples: mapping that contains a ``"Label"`` entry.

    Returns:
        The same ``examples`` object, now also carrying ``"label"``.
    """
    label_values = examples["Label"]
    examples["label"] = label_values
    return examples

In [47]:
# Step 1: tokenize the 'Text' column of both splits in batches.
temp_tokenized_dataset = training_set.map(preprocess_function, batched = True)
# Step 2: copy 'Label' into a lowercase 'label' column
# (presumably the column name the collator/Trainer look for - TODO confirm).
tokenized_dataset = temp_tokenized_dataset.map(clean_function, batched = True)
# Display the resulting DatasetDict so the added columns can be checked.
tokenized_dataset

Map:   0%|          | 0/1312 [00:00<?, ? examples/s]

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

Map:   0%|          | 0/1312 [00:00<?, ? examples/s]

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'input_ids', 'attention_mask', 'label'],
        num_rows: 1312
    })
    test: Dataset({
        features: ['Text', 'Label', 'input_ids', 'attention_mask', 'label'],
        num_rows: 328
    })
})

In [48]:
from transformers import DataCollatorWithPadding

In [49]:
# Collator that pads the examples of each batch with the tokenizer's padding
# settings; tokenization above did not pad, so padding happens here per batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [50]:
import numpy as np
from datasets import load_metric

# `datasets.load_metric` is deprecated (and removed in later `datasets`
# releases) in favour of the `evaluate` package, which the first cell already
# installs. `evaluate.load` returns objects with the same
# `.compute(predictions=..., references=...)` interface used by compute_metrics.
import evaluate

metric = evaluate.load("accuracy")
f1_score = evaluate.load("f1")
precision_met = evaluate.load("precision")
recall_met = evaluate.load("recall")

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy, F1, precision and recall.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. The last three are computed with ``average="weighted"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging argument, so it is computed on its own.
    scores = {
        "accuracy": metric.compute(predictions=predictions, references=labels)["accuracy"]
    }

    # The remaining metrics share one call shape; each result dict is keyed by
    # the metric's own name.
    weighted_metrics = (
        ("f1", f1_score),
        ("precision", precision_met),
        ("recall", recall_met),
    )
    for key, scorer in weighted_metrics:
        result = scorer.compute(predictions=predictions, references=labels, average="weighted")
        scores[key] = result[key]

    return scores

In [51]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load DistilBERT with a sequence-classification head configured for 2 labels.
# The classifier weights are not part of the checkpoint and start out newly
# initialized (see the warning printed below), so the model must be fine-tuned.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch so that the
# best checkpoint can be reloaded when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="training",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=11,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, both data splits, the padding collator and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [64]:
import torch

# Ask PyTorch to release cached GPU memory it is not currently using, so
# leftovers from earlier runs in this kernel do not crowd out training.
torch.cuda.empty_cache()


In [73]:
import os

# Resume from the saved checkpoint only when it is actually on disk (the
# recorded run continued at epoch 6 from it). On a fresh kernel the directory
# does not exist and passing the path would make `train` fail, so fall back to
# training from scratch in that case.
RESUME_CHECKPOINT = "training/checkpoint-330"
trainer.train(
    resume_from_checkpoint=RESUME_CHECKPOINT if os.path.isdir(RESUME_CHECKPOINT) else None
)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
6,No log,3.756255,0.487805,0.48262,0.489336,0.487805
7,No log,3.671934,0.509146,0.50853,0.510049,0.509146
8,0.009400,3.731884,0.506098,0.500461,0.508634,0.506098


KeyboardInterrupt: ignored

In [71]:
# Score the current model on the eval_dataset given to the Trainer (the 'test'
# split); the returned dict of eval_* metrics is displayed as the cell output.
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
5,No log,3.69403,0.503049,0.490316,0.506819,0.503049


{'eval_loss': 3.6940295696258545,
 'eval_accuracy': 0.5030487804878049,
 'eval_f1': 0.49031589079474264,
 'eval_precision': 0.5068194283133308,
 'eval_recall': 0.5030487804878049}

In [21]:
# Show the Hugging Face login widget; authentication is needed before the
# push_to_hub call below can upload anything.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [72]:
# Upload the trained model and training artifacts to the Hugging Face Hub
# (the recorded run pushed to the repository URL shown in the output below).
trainer.push_to_hub()


events.out.tfevents.1700947391.df68e7d6d157.1716.6:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

events.out.tfevents.1700947570.df68e7d6d157.1716.7:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/davidgaofc/training/tree/main/'