### Fine tuning a model for classification
A walkthrough from https://huggingface.co/blog/sentiment-analysis-python

In [1]:
import torch



In [2]:
from datasets import load_dataset
imdbdata = load_dataset('imdb')

# Subset for faster training
small_train_dataset = imdbdata["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdbdata["test"].shuffle(seed=42).select([i for i in list(range(300))])

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',device='mps')
def tokenize_function(example):
    return tokenizer(example['text'],padding='max_length',truncation=True)

In [11]:
tokenized_train = small_train_dataset.map(tokenize_function,batched=True)

In [12]:
tokenized_test = small_test_dataset.map(tokenize_function,batched=True)

"To speed up training, let's use a data_collator to convert your training samples to PyTorch tensors and concatenate them with the correct amount of padding:"

In [6]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Define base model before  fine-tuning:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Define methods for evaluating model performance:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred # Unpack
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

In [9]:
# Log into Hugging Face:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

TypeError: 'int' object is not callable

In [14]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=376, training_loss=0.27830201006950217, metrics={'train_runtime': 2772.1764, 'train_samples_per_second': 2.164, 'train_steps_per_second': 0.136, 'total_flos': 794804391936000.0, 'train_loss': 0.27830201006950217, 'epoch': 2.0})

In [22]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


AttributeError: 'float' object has no attribute 'size'

In [16]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ghpetty/finetuning-sentiment-model-3000-samples/commit/690eb07d4d49ed0157042ea27cbe8698d915d100', commit_message='End of training', commit_description='', oid='690eb07d4d49ed0157042ea27cbe8698d915d100', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
from transformers import pipeline
 
sentiment_model = pipeline(model="ghpetty/finetuning-sentiment-model-3000-samples")
sentiment_model(["I love this move", "This movie sucks!"])

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

RuntimeError: The model ghpetty/finetuning-sentiment-model-3000-samples does not seem to have a correct `pipeline_tag` set to infer the task automatically