In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Keep one quarter of each IMDB split so the demo trains quickly.
#
# `contiguous=False` makes the shard take every 4th example instead of one
# consecutive quarter of the split.
# NOTE(review): the default value of `contiguous` differs between `datasets`
# releases, and the IMDB splits appear to be ordered by label, so a contiguous
# quarter could contain a single class only -- confirm against the installed
# version. Passing the flag explicitly gives the same strided sample everywhere.
train_data = load_dataset("imdb", split="train")
train_data = train_data.shard(num_shards=4, index=0, contiguous=False)

test_data = load_dataset("imdb", split="test")
test_data = test_data.shard(num_shards=4, index=0, contiguous=False)

<br/><br/><br/>

### Tokenizing Data

In [None]:
# Load the pretrained checkpoint together with its matching tokenizer.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Settings shared by both tokenizer calls: return PyTorch tensors, pad every
# sequence to the longest one in the call, and cut anything beyond 64 tokens.
tokenizer_options = dict(
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)

# Tokenize the dataset
# `list(...)` hands the tokenizer a plain list of strings.
# NOTE(review): newer `datasets` releases may return a lazy column object from
# `dataset["text"]` rather than a list, which the tokenizer's input validation
# can reject -- confirm against the installed version. The conversion is a
# no-op in behaviour when the column already is a list.
tokenized_training_data = tokenizer(list(train_data["text"]), **tokenizer_options)

tokenized_test_data = tokenizer(list(test_data["text"]), **tokenizer_options)

In [None]:
def tokenize_function(text_data):
    """Tokenize the "text" field of a single example or of a batch of examples.

    Written to be passed to `Dataset.map`, in either mode:

    * batched=True  -- `text_data["text"]` is a list of strings.
    * batched=False -- `text_data["text"]` is a single string.

    Args:
        text_data: Mapping with a "text" entry (one string or a list of strings).

    Returns:
        The tokenizer output for the text, with every sequence truncated and
        padded to exactly 64 tokens.
    """
    return tokenizer(
        text_data["text"],
        # No `return_tensors`: for a single string, requesting tensors adds a
        # leading batch dimension of size 1, so each mapped row would be stored
        # as a nested sequence.
        #
        # Pad to the fixed maximum rather than `padding=True`: padding to the
        # longest sequence *in the call* pads nothing row by row and can give
        # different lengths from one batch to the next.
        padding="max_length",
        truncation=True,
        max_length=64,
    )
  
# Batched mode: tokenize_function receives many rows per call.
tokenized_in_batched = train_data.map(
    function=tokenize_function,
    batched=True,
)

# Row mode: tokenize_function receives exactly one row per call.
tokenized_in_row = train_data.map(
    function=tokenize_function,
    batched=False,
)

<br/><br/><br/>

### Fine-tuning through training

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the fine-tuning run, gathered in one mapping so they are
# easy to scan and tweak before being handed to TrainingArguments.
finetune_hyperparameters = {
    "output_dir": "./finetuned",
    # "evaluation_strategy": "epoch",  # left disabled, as in the original run
    "num_train_epochs": 3,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "weight_decay": 0.01,
}

trainig_args = TrainingArguments(**finetune_hyperparameters)

In [None]:
# Trainer needs datasets made of individual examples that carry both the model
# inputs and a label. The raw tokenizer outputs (`tokenized_training_data`,
# `tokenized_test_data`) are dictionaries of whole-dataset tensors with no
# labels, so they cannot be used for training. The `Dataset.map` results keep
# the original "label" column next to the token ids, so those are used instead.
tokenized_test_in_batched = test_data.map(tokenize_function, batched=True)

trainer = Trainer(
    model=model,
    args=trainig_args,
    train_dataset=tokenized_in_batched,
    eval_dataset=tokenized_test_in_batched,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `processing_class` -- confirm against the installed version.
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
import torch

# Run the fine-tuned model on a couple of unseen reviews and print the
# predicted sentiment for each one.
new_data = ["This movie was disappointing", "This is the best movie ever!"]

new_input = tokenizer(
    new_data,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)
# Training may have moved the model to an accelerator; the inputs must live on
# the same device as the model's weights.
new_input = new_input.to(model.device)

# Put the model in evaluation mode so dropout is disabled for inference.
model.eval()
with torch.no_grad():
    outputs = model(**new_input)

# One class index per input text (the tensor method is `tolist`, not `to_list`).
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

label_map = {
    0: "negative",
    1: "positive",
}

# Look up each text's own prediction; indexing the dict with the whole list
# would raise a TypeError because lists are unhashable.
for i, (text, label_id) in enumerate(zip(new_data, predicted_labels), start=1):
    sentiment = label_map[label_id]
    print(f"\nInput Text {i}: {text}")
    print(f"\nPredicted Label: {sentiment}")

In [None]:
# Persist the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("my_finetuned_files")

In [None]:
# Restore the model and its tokenizer from the directory they were saved to.
saved_model_dir = "my_finetuned_files"
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)