# SMS Fraudulent Classification

# A. Load Dataset

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("tinu10kumar/sms-spam-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sms-spam-dataset


In [2]:
import pandas as pd

df = pd.read_csv("/kaggle/input/sms-spam-dataset/combined_dataset.csv", encoding='latin-1')

In [3]:
df.head(100)

Unnamed: 0,target,text
0,spam,Congratulations! You've been selected for a lu...
1,spam,URGENT: Your account has been compromised. Cli...
2,spam,You've won a free iPhone! Claim your prize by ...
3,spam,Act now and receive a 50% discount on all purc...
4,spam,Important notice: Your subscription will expir...
...,...,...
95,spam,Unlock access to our members-only portal. Join...
96,spam,"Hurry, our anniversary sale ends soon. Shop no..."
97,spam,Important notice: Your subscription is about t...
98,spam,Special deal just for you: Use code SPECIAL10 ...


In [4]:
df.shape

(10961, 2)

# B. Preprocessing Dataset
* Clean the text data by removing unwanted characters, punctuation, and ensuring consistent casing.

* Text tokenization: Convert text into sequences of words or subwords.

* Label Encoding: Convert the labels (spam, ham) into numerical values (0 for ham, 1 for spam).

In [5]:
# clean the text data by removing unwanted characters, punctions and ensuring consistent casing
import re
import string
from sklearn.preprocessing import LabelEncoder

#preprocessing function
def clean_text(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(f"[{string.punctuation}]", " ", text)  # remove punctuation
    text = re.sub(r"\d+", " ", text)  # remove numbers
    text = text.strip()  # remove extra spaces
    return text

# clean the text column
df["text"] = df["text"].apply(clean_text)

# encode target labels, "ham = 0", "spam = 1"
le = LabelEncoder()
df["label"] = le.fit_transform(df["target"])

In [6]:
import random
# Get a random number of rows between 1 and 100
n_random_rows = random.randint(1, 100)
# Display a random sample of rows using the sample() method
display(df.sample(n=n_random_rows))

Unnamed: 0,target,text,label
2589,spam,we ve found a school for you,1
865,spam,get back to me please once south came again he...,1
3975,ham,assignment termination expiration report ...,0
8613,ham,that s fine have him give me a call if he kno...,0
4717,ham,deal should we keep rolling this deal,0
...,...,...,...
1715,ham,hpl nom for august see attached file ...,0
10756,ham,yes princess i want to please you every night...,0
3670,spam,off for all new software darned cannibalized...,1
4301,spam,,1


# tokenization and text-vectorization with transformers

In [7]:
!pip install tensorflow transformers



In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

max_length = 64

def encode_text(text):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length= max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="tf",
    )
encoded_text = df["text"].apply(lambda x : encode_text(x))
encoded_text[0]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

2025-10-05 23:36:05.664194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759707366.040575      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759707366.146322      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1759707381.770779      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1759707381.771539      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability:

# model definition and training

In [9]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, use_safetensors=False)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy("accuracy")],
    run_eagerly=True # Add this line
)

tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# prepare trainig and validation data

In [11]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer

# Assuming the tokenizer is defined and max_length is set in a previous cell
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# max_length = 64

def encode_text(text):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length= max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="tf",
    )
    return tokens

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Tokenize the training and validation data
train_encodings = X_train.apply(lambda x: encode_text(x))
val_encodings = X_val.apply(lambda x: encode_text(x))

# Convert the Series of encoding dictionaries into dictionaries of arrays with string keys
def convert_to_dict_of_arrays(encodings):
    input_ids = tf.concat([x['input_ids'] for x in encodings], axis=0)
    attention_mask = tf.concat([x['attention_mask'] for x in encodings], axis=0)
    # Include token_type_ids if your model uses them and the tokenizer provides them
    # token_type_ids = tf.concat([x['token_type_ids'] for x in encodings], axis=0)
    return {'input_ids': input_ids, 'attention_mask': attention_mask}#, 'token_type_ids': token_type_ids}

train_input_dict = convert_to_dict_of_arrays(train_encodings)
val_input_dict = convert_to_dict_of_arrays(val_encodings)


# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_dict, y_train.values))
val_dataset = tf.data.Dataset.from_tensor_slices((val_input_dict, y_val.values))

# Batch the datasets
train_dataset = train_dataset.batch(16).shuffle(1000)
val_dataset = val_dataset.batch(16)

In [12]:
# train the model
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=val_dataset,
)

Epoch 1/3


I0000 00:00:1759707428.047915      19 service.cc:148] XLA service 0x58126800 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1759707428.049430      19 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1759707428.049450      19 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1759707428.173812      19 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1759707428.446272      19 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3


In [13]:
# Evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(val_dataset)
print(f'Validation Accuracy: {val_accuracy*100:.2f}%')


Validation Accuracy: 97.54%


In [14]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on validation data
y_pred = model.predict(val_dataset)
y_pred_labels = tf.argmax(y_pred.logits, axis=1)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
report = classification_report(y_val, y_pred_labels)
print(report)


Confusion Matrix:
[[1696   10]
 [  44  443]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1706
           1       0.98      0.91      0.94       487

    accuracy                           0.98      2193
   macro avg       0.98      0.95      0.96      2193
weighted avg       0.98      0.98      0.98      2193



In [15]:
# Save the trained model
model.save_pretrained("sms_spam_model")
tokenizer.save_pretrained("sms_spam_model")


('sms_spam_model/tokenizer_config.json',
 'sms_spam_model/special_tokens_map.json',
 'sms_spam_model/vocab.txt',
 'sms_spam_model/added_tokens.json')