## BERT-based classification

In [None]:
%pip install torch
%pip install accelerate --upgrade

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this environment.
print('CUDA available:', torch.cuda.is_available())

In [None]:
%pip install datasets transformers evaluate huggingface_hub

In [None]:
import csv
import collections
import numpy as np
import random

import evaluate

from datasets import Dataset
from transformers import TrainingArguments, Trainer

In [None]:
# Class names as they appear in the "class" column; index 1 is the positive (spam) class.
classes = ('ham', 'spam')

# Read the tab-separated corpus. DictReader takes the column names from the
# first row, so the file must start with a header containing "class" and "text".
# newline="" is what the csv module documentation asks for when opening files.
with open("../examples/SMSSpamCollection.txt", encoding="utf-8", newline="") as csvfile:
    reader = csv.DictReader(csvfile, delimiter="\t")
    # Label 1 for spam; every other class value is treated as ham (label 0).
    dataset = [
        {'text': row["text"], 'label': int(row["class"] == classes[1])}
        for row in reader
    ]

# Shuffle in place, then split 80/20 into train and test sets.
# NOTE(review): the shuffle is unseeded, so the split differs between runs.
random.shuffle(dataset)
split_index = int(len(dataset) * .8)
train = Dataset.from_list(dataset[:split_index])
test = Dataset.from_list(dataset[split_index:])

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the module-level tokenizer.

    Truncation is enabled (``truncation=True``); padding is not requested
    here. Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits with preprocess_function; batched=True hands it
# a batch of rows per call instead of a single row.
tokenized_train = train.map(preprocess_function, batched=True)
tokenized_test = test.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
# Collator built around the tokenizer; per the transformers docs it pads the
# inputs of each batch dynamically at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification model initialised from the "distilbert-base-uncased"
# checkpoint; num_labels=2 matches the two labels (0 and 1) built during data loading.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
import functools

import numpy as np


@functools.lru_cache(maxsize=None)
def _load_metric(name):
    """Load an ``evaluate`` metric by name once and reuse it on later calls."""
    return evaluate.load(name)


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Metric objects come from a cache so repeated evaluations do not reload
    # them on every call; loading still happens lazily on first use.
    accuracy = _load_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _load_metric("f1").compute(
        predictions=predictions, references=labels
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Directory name handed to TrainingArguments as output_dir.
repo_name = "my_awesome_model"

# Fine-tuning configuration, grouped by purpose.
training_args = TrainingArguments(
    # Output and reporting
    output_dir=repo_name,
    save_strategy="epoch",
    report_to="none",
    push_to_hub=False,
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Run evaluation with the Trainer; as the last expression in the cell, its result is displayed.
trainer.evaluate()