# dataset

In [30]:
from datasets import load_dataset

In [None]:
# Load the "yelp_review_full" dataset and print its container type and the
# names of the splits it holds.
dataset = load_dataset("yelp_review_full")
print(type(dataset))
print(dataset.keys())

# Draw a reproducible 1000-row sample from each split: shuffle with a fixed
# seed, then keep the first 1000 rows of the shuffled split.
sub_train_dataset, sub_test_dataset = (
    dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)

<class 'datasets.dataset_dict.DatasetDict'>
dict_keys(['train', 'test'])


In [None]:
# Inspect a single record (row index 100) of the sampled training split.
sub_train_dataset[100]

{'label': 3,
 'text': "I recently brough my car up to Edinburgh from home, where it had sat on the drive pretty much since I had left home to go to university.\\n\\nAs I'm sure you can imagine, it was pretty filthy, so I pulled up here expecting to shell out \\u00a35 or so for a crappy was that wouldnt really be that great.\\n\\nNeedless to say, when I realised that the cheapest was was \\u00a32, i was suprised and I was even more suprised when the car came out looking like a million dollars.\\n\\nVery impressive for \\u00a32, but thier prices can go up to around \\u00a36 - which I'm sure must involve so many polishes and waxes and cleans that dirt must be simply repelled from the body of your car, never getting dirty again."}

# tokenizer

In [34]:
from transformers import AutoTokenizer

In [35]:
# Load the tokenizer for the "google-bert/bert-base-cased" checkpoint, the same
# checkpoint the classification model in this notebook is initialised from.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: A mapping that holds the raw review strings under the
            ``"text"`` key (this is what ``Dataset.map(..., batched=True)``
            is expected to pass in — confirm against the caller).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called on those
        texts with ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    # Fixed-length padding plus truncation are requested so that every
    # encoded example comes back with the same length.
    return tokenizer(texts, truncation=True, padding="max_length")

In [55]:
# Apply tokenize_function, in batches, to every split of the full dataset.
# NOTE(review): only 1000 shuffled rows per split are selected from this result
# afterwards, so mapping the already-sampled subsets instead would avoid
# tokenizing the complete splits — confirm nothing else needs the full result.
tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map: 100%|██████████| 650000/650000 [01:41<00:00, 6381.27 examples/s]
Map: 100%|██████████| 50000/50000 [00:07<00:00, 6469.37 examples/s]


In [56]:
# From each tokenized split, take a reproducible 1000-row sample (fixed-seed
# shuffle followed by selecting the first 1000 rows). The "test" split serves
# as the evaluation set.
small_train_dataset, small_eval_dataset = [
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
]


In [57]:
# Count the distinct label values present in the training sample; the model
# below is created with num_labels=5 to match.
len(set(small_train_dataset["label"]))

5

# model

In [75]:
from transformers import AutoModelForSequenceClassification

# Create a sequence-classification model from the pretrained BERT checkpoint,
# configured for 5 output labels. The load-time warning reports that the
# classifier weights/bias are newly initialised, i.e. the model still has to be
# trained before its predictions are useful.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased", num_labels=5, torch_dtype="auto"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# pipeline

In [76]:
from transformers import pipeline


In [77]:
# Wrap the model and tokenizer defined above in a text-classification pipeline
# so raw strings can be classified directly.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


Device set to use mps:0


In [78]:
# Smoke-test the pipeline on one sentence.
# NOTE(review): the model's classification head was reported as newly
# initialised when it was loaded, so this prediction is presumably not
# meaningful until training has completed — confirm before relying on it.
result = classifier("This restaurant has excellent service and great food!")
print(result)


[{'label': 'LABEL_0', 'score': 0.31116628646850586}]


# train

## arguments

In [60]:
from transformers import TrainingArguments

In [61]:
# Minimal training configuration: only the output directory is specified.
# NOTE(review): `training_args` is rebound further down with eval_strategy and
# num_train_epochs set, so this value is superseded before the Trainer is built.
training_args = TrainingArguments(output_dir="test_trainer")

## evaluate

In [62]:
import numpy as np
import evaluate



In [63]:
# Load the "accuracy" metric; compute_metrics calls metric.compute(...) on it.
metric = evaluate.load("accuracy")

In [64]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``. The arg-max is
            taken over ``axis=-1``, so the class scores are assumed to lie
            along the last axis of ``logits``.

    Returns:
        The result of ``metric.compute`` for the predicted class indices
        compared against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)


In [72]:
from transformers import TrainingArguments, Trainer

# Training configuration used by the Trainer: outputs go to "test_trainer",
# evaluation uses the "epoch" strategy, and training is limited to 0.01 of an
# epoch (presumably a quick smoke-test run — raise this for real training).
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    num_train_epochs=0.01,
)

## trainer

In [73]:
# Assemble the Trainer from the pieces defined above: the classification model,
# the training configuration, the 1000-row tokenized train/eval samples, and
# the accuracy-based compute_metrics callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


In [74]:
# Run training with the configuration held by `trainer`.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt with no
# metric rows, i.e. the recorded run was interrupted — re-run to completion
# before trusting any artefacts written afterwards.
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the model and the tokenizer side by side in "test_trainer" (the same
# directory used as the Trainer's output_dir).
model.save_pretrained("test_trainer")
tokenizer.save_pretrained("test_trainer")