In [1]:
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from transformers import TextClassificationPipeline
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate
import pandas as pd

In [2]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
metric = evaluate.load("accuracy")

In [4]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [5]:
spamData = pd.read_csv("Data/spam.csv", encoding = "latin1")

In [6]:
spamData = spamData.rename(columns={"v2": "text"})
spamData = spamData.rename(columns={"v1": "label"})

In [7]:
spamData["text"] = spamData["text"].astype(str)

In [8]:
spamData["label"].value_counts()

label
ham     4842
spam     771
Name: count, dtype: int64

In [9]:
spamData.replace(to_replace = "spam", value = 1, inplace = True)
spamData.replace(to_replace = "ham", value = 0, inplace = True)

In [10]:
grouped_data = spamData.groupby("label", group_keys=False).apply(lambda x: x.sample(771))

In [11]:
grouped_data["label"].value_counts()

label
0    771
1    771
Name: count, dtype: int64

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
train_dataset, eval_dataset = train_test_split(grouped_data, test_size = 0.07, random_state = 55)
train_dataset.reset_index(drop = True, inplace = True)
eval_dataset.reset_index(drop = True, inplace = True)

In [14]:
train_dataset["label"].value_counts()

label
0    725
1    709
Name: count, dtype: int64

In [15]:
eval_dataset["label"].value_counts()

label
1    62
0    46
Name: count, dtype: int64

In [16]:
train_hf_dataset = Dataset.from_pandas(train_dataset)
eval_hf_dataset = Dataset.from_pandas(eval_dataset)

In [17]:
train_data = train_hf_dataset.map(tokenize_function, batched=True)
eval_data = eval_hf_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1434 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [18]:
train_data

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1434
})

In [19]:
training_args = TrainingArguments(
    output_dir = "test_trainer", 
    eval_strategy = "epoch",
    num_train_epochs = 3,
    per_device_train_batch_size = 10,
    per_device_eval_batch_size = 10,
    weight_decay = 0.01,
    use_cpu = True
)

In [20]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = eval_data,
    compute_metrics = compute_metrics
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.037254,0.990741
2,No log,0.008351,1.0
3,No log,0.001355,1.0


TrainOutput(global_step=432, training_loss=0.08411250291047273, metrics={'train_runtime': 3678.3041, 'train_samples_per_second': 1.17, 'train_steps_per_second': 0.117, 'total_flos': 282975940039680.0, 'train_loss': 0.08411250291047273, 'epoch': 3.0})

In [22]:
trainer.evaluate()

{'eval_loss': 0.0013554609613493085,
 'eval_accuracy': 1.0,
 'eval_runtime': 4.6389,
 'eval_samples_per_second': 23.281,
 'eval_steps_per_second': 2.371,
 'epoch': 3.0}

In [23]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k = None)
pipe("Hello")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[[{'label': 'LABEL_0', 'score': 0.9997345805168152},
  {'label': 'LABEL_1', 'score': 0.0002653638366609812}]]

In [24]:
def model_prediction(text):
    predictions_hud = pipe(text)
    if predictions_hud[0][0]["label"] == "LABEL_1":
        return True
    else:
        return False