In [1]:
from transformers import AutoModelForSequenceClassification,Trainer,AutoTokenizer,TrainingArguments

In [2]:
from datasets import load_dataset

In [3]:
data = load_dataset('sms_spam',split='train').train_test_split(test_size=0.3,seed=23,shuffle=True)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
splits= ['train','test']

In [7]:
tokened = {}
for split in splits:
    tokened[split] = data[split].select(range(500)).map(lambda x: tokenizer(x['sms']), batched=True)
    

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
for params in model.parameters():
    params.requires_grad=True

In [15]:
def compute(x):
    preds, acts = x
    return {'accuracy':(acts == preds).mean()}

In [16]:
trainer = Trainer(model=model,
                  args=TrainingArguments(
                      output_dir = './sms',
                      learning_rate=2e-5,
                      num_train_epochs=1,
                      weight_decay=0.01),
                  train_dataset=tokened['train'],
                  eval_dataset=tokened['test'],
                  tokenizer=tokenizer,
                  )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [17]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=63, training_loss=0.07640096119471959, metrics={'train_runtime': 93.22, 'train_samples_per_second': 5.364, 'train_steps_per_second': 0.676, 'total_flos': 6728929891104.0, 'train_loss': 0.07640096119471959, 'epoch': 1.0})

In [18]:
trainer.evaluate()


{'eval_loss': 0.0804712101817131,
 'eval_runtime': 19.4635,
 'eval_samples_per_second': 25.689,
 'eval_steps_per_second': 3.237,
 'epoch': 1.0}

In [52]:
examples = [1,4,6,78,87,436,75]

In [19]:
data_e = trainer.predict(tokened['test'].select(range))

In [29]:
data_e.predictions

array([[ 2.5231953, -2.9942236],
       [ 2.134394 , -2.411899 ],
       [ 2.51499  , -2.8312001],
       [ 2.525272 , -2.9664283],
       [ 2.5847516, -2.927692 ]], dtype=float32)

In [30]:
preds = data_e.predictions

In [42]:
import numpy as np
np.max(preds,axis=1)

array([2.5231953, 2.134394 , 2.51499  , 2.525272 , 2.5847516],
      dtype=float32)

In [36]:
preds

array([[ 2.5231953, -2.9942236],
       [ 2.134394 , -2.411899 ],
       [ 2.51499  , -2.8312001],
       [ 2.525272 , -2.9664283],
       [ 2.5847516, -2.927692 ]], dtype=float32)

In [44]:
import pandas as pd

In [53]:
import pandas as pd

# Assuming tokened['test'].select(examples) returns a list of tokenized examples
# Assuming trainer is your model trainer
# Assuming examples is defined elsewhere

# Extracting 'sms' and 'label' from tokened['test'] examples
sms_list = [x['sms'] for x in tokened['test'].select(examples)]
label_list = [x['label'] for x in tokened['test'].select(examples)]

# Making predictions using the trainer and extracting argmax
preds = np.argmax(trainer.predict(tokened['test'].select(examples)).predictions, axis=1)

# Creating DataFrame
df = pd.DataFrame({'sms': sms_list,
                   'labels': label_list,
                   'preds': preds})


In [60]:
df.iloc[3]

sms       Fuck babe ... I miss you already, you know ? C...
labels                                                    0
preds                                                     0
Name: 3, dtype: object