<a href="https://colab.research.google.com/github/elhamod/IS883/blob/main/Fine_tuning_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Yelp review modeling

We will use the `yelp_review_full` dataset to build a model that takes a review as input and predicts its star rating (1 to 5 stars).

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base

Load the dataset

In [None]:
from datasets import load_dataset

# Load the `yelp_review_full` dataset; later cells read its "train" and "test" entries via `dataset`.
DATASET_NAME = "yelp_review_full"
dataset = load_dataset(DATASET_NAME)

Show dataset structure

In [None]:
# Bare last expression: the notebook renders the repr of the loaded dataset object.
dataset

Show an example

In [None]:
# Display the record at index 100 of the "train" split.
dataset["train"][100]

## Step 1:

Tokenize the dataset

In [None]:
from transformers import AutoTokenizer

# Create the tokenizer from the pretrained checkpoint named below.
# Checkpoint list: https://huggingface.co/transformers/v3.3.1/pretrained_models.html
tokenizer_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the review text(s);
            it is passed here by `Dataset.map(..., batched=True)`.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, called
        with `padding="max_length"` and `truncation=True`.
    """
    review_texts = examples["text"]
    encoded = tokenizer(review_texts, truncation=True, padding="max_length")
    return encoded

# Take a small subset of each split, shuffled with a fixed seed.
SUBSET_SEED = 42
SUBSET_SIZE = 1000


def _take_subset(split_name):
    """Shuffle the named split with SUBSET_SEED and keep its first SUBSET_SIZE rows."""
    shuffled = dataset[split_name].shuffle(seed=SUBSET_SEED)
    return shuffled.select(range(SUBSET_SIZE))


sampled_dataset_train = _take_subset("train")
sampled_dataset_test = _take_subset("test")

# Apply the tokenizer to both subsets, one batch of examples at a time.
tokenized_dataset_train = sampled_dataset_train.map(tokenize_function, batched=True)
tokenized_dataset_test = sampled_dataset_test.map(tokenize_function, batched=True)

In [None]:
# Display the tokenized training subset to inspect what `map(tokenize_function, ...)` produced.
tokenized_dataset_train

## Step 2:

Train the model

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

# Create a sequence-classification model from the `bert-base-cased` checkpoint,
# with one label per possible star rating.
NUM_STAR_RATINGS = 5
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=NUM_STAR_RATINGS,
)

Set up a metric to evaluate the model

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric; `compute_metrics` calls `metric.compute(...)` on it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level `metric`.

    Args:
        eval_pred: A two-item pair that unpacks as `(logits, labels)`.

    Returns:
        The result of `metric.compute`, given the argmax over the last axis
        of the logits as predictions and the labels as references.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

Train!

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# The install cell does not pin a version, so use whichever keyword this installation accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Setup the trainer: evaluate every 75 steps, with `test_trainer` as the output directory.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_steps=75,
    **{_eval_strategy_kwarg: "steps"},
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression, the value returned by `train()` is displayed.
trainer.train()

## Step 3:

Save the model

In [None]:
# Save the model through the trainer to the `./yelp_classifier` path.
trainer.save_model("./yelp_classifier")

## Step 4: Things to play with:

- Train from scratch.
- Hyper-parameter tuning.
- Study the dataset.
- Try a different model.
- Why is there no train logging?
- Load the model.
- Test the model.
