In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader
import torch


In [2]:
from google.colab import files
files.upload()  # Upload kaggle.json


Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"divyass","key":" "}'}

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!pip install -q kaggle
!kaggle datasets download -d abdallahwagih/spam-emails
!unzip spam-emails.zip -d spam_emails


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
spam-emails.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  spam-emails.zip
replace spam_emails/spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [5]:
import pandas as pd

df = pd.read_csv("spam_emails/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df['Message'] = df['Message'].apply(lambda x: x.lower())
df['Category'] = df['Category'].map({'spam':0, 'ham':1})
df

Unnamed: 0,Category,Message
0,1,"go until jurong point, crazy.. available only ..."
1,1,ok lar... joking wif u oni...
2,0,free entry in 2 a wkly comp to win fa cup fina...
3,1,u dun say so early hor... u c already then say...
4,1,"nah i don't think he goes to usf, he lives aro..."
...,...,...
5567,0,this is the 2nd time we have tried 2 contact u...
5568,1,will ü b going to esplanade fr home?
5569,1,"pity, * was in mood for that. so...any other s..."
5570,1,the guy did some bitching but i acted like i'd...


In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Message'], df['Category'], test_size=0.2)

model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True)

In [9]:
class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = EmailDataset(train_encodings, train_labels.tolist())
test_dataset = EmailDataset(test_encodings, test_labels.tolist())

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100
)


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Step,Training Loss
100,0.0396
200,0.0413
300,0.0313
400,0.0681


Step,Training Loss
100,0.0396
200,0.0413
300,0.0313
400,0.0681
500,0.0471
600,0.0454
700,0.0273
800,0.0229
900,0.0562
1000,0.0205


TrainOutput(global_step=1674, training_loss=0.027708286533561693, metrics={'train_runtime': 17237.7792, 'train_samples_per_second': 0.776, 'train_steps_per_second': 0.097, 'total_flos': 764531193011508.0, 'train_loss': 0.027708286533561693, 'epoch': 3.0})

In [21]:
# Save the model and tokenizer
trainer.save_model('./spam_classifier_model')
tokenizer.save_pretrained('./spam_classifier_model')

('./spam_classifier_model/tokenizer_config.json',
 './spam_classifier_model/special_tokens_map.json',
 './spam_classifier_model/vocab.txt',
 './spam_classifier_model/added_tokens.json')

In [23]:
results = trainer.evaluate()

print(results)

{'eval_loss': 0.0403742678463459, 'eval_runtime': 472.4757, 'eval_samples_per_second': 2.36, 'eval_steps_per_second': 0.148, 'epoch': 3.0}


In [24]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import pipeline

model = DistilBertForSequenceClassification.from_pretrained('./spam_classifier_model')
tokenizer = DistilBertTokenizer.from_pretrained('./spam_classifier_model')
# Create a text classification pipeline
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Your test sentence
test_sentence = "You are a winner you have been specially. Selected to receive $1000 cash or a $2000 award."

# Get prediction
result = classifier(test_sentence)

print(result)

Device set to use cpu


[{'label': 'LABEL_0', 'score': 0.9992455244064331}]


In [26]:
## without using pipeline
inputs = tokenizer("Hey, just wanted to check if we're still on, for dinner tonight? Let me know!", return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
logits = outputs.logits
probs = torch.softmax(logits, dim=1)
predicted_class = torch.argmax(probs).item()

print(f"Predicted class: {predicted_class} with probability {probs[0][predicted_class].item():.4f}")

Predicted class: 1 with probability 1.0000
