## Data Loading and Initial Exploration

In [None]:
import pandas as pd

In [None]:
# Load the raw spam/ham dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Count how many messages fall into each category.
df["Category"].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Downsampling: separate the spam and ham messages into their own frames.
# This cell isolates the spam rows.
is_spam = df["Category"] == "spam"
df_spam = df.loc[is_spam]
df_spam.shape

(747, 2)

In [None]:
# Isolate the ham rows.
is_ham = df["Category"] == "ham"
df_ham = df.loc[is_ham]
df_ham.shape

(4825, 2)

In [None]:
# Downsample ham to the same number of rows as spam so the classes are balanced.
# A fixed random_state makes the draw repeatable: without it a fresh kernel
# samples different ham rows on every run, which changes every downstream result.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=0)
df_ham_downsampled.shape

(747, 2)

In [None]:
# Stack the spam rows on top of the downsampled ham rows.
balanced_parts = [df_spam, df_ham_downsampled]
df_balanced = pd.concat(balanced_parts)
df_balanced.shape

(1494, 2)

In [None]:
# Preview the first five rows of the balanced frame.
df_balanced.head(n=5)

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."


In [None]:
# Check the per-category row counts after balancing.
df_balanced["Category"].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
spam,747
ham,747


In [None]:
# Encode the text label as an integer target: spam -> 1, ham -> 0.
label_to_id = {'spam': 1, 'ham': 0}
df_balanced['Category_num'] = df_balanced['Category'].map(label_to_id)

In [None]:
# Preview the frame with the new numeric label column.
df_balanced.head(n=5)

Unnamed: 0,Category,Message,Category_num
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1


Let's preprocess the text by removing stop words and punctuation.

I'll be using spaCy.

In [None]:
import spacy

In [None]:
# Load spaCy's "en_core_web_sm" pipeline.
nlp = spacy.load(name="en_core_web_sm")


In [None]:
def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`. Every token
    that is neither a stop word nor punctuation contributes its lemma, and the
    lemmas are joined with single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

In [None]:
# Apply the preprocessing function to every message and store the result in a new column.
df_balanced['preprocessed_text'] = df_balanced['Message'].apply(preprocess)

In [None]:
# Preview the frame with the preprocessed text column.
df_balanced.head(n=5)

Unnamed: 0,Category,Message,Category_num,preprocessed_text
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win FA Cup final tkts 2...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,FreeMsg hey darle 3 week word like fun tb ok X...
8,spam,WINNER!! As a valued network customer you have...,1,WINNER value network customer select receivea ...
9,spam,Had your mobile 11 months or more? U R entitle...,1,mobile 11 month u r entitle update late colour...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",1,"chance win cash 100 20,000 pound txt > CSH11 s..."


In [None]:
# Keep only the preprocessed text and the numeric label.
model_columns = ['preprocessed_text', 'Category_num']
df_preprocessed = df_balanced[model_columns]
df_preprocessed.head(n=5)

Unnamed: 0,preprocessed_text,Category_num
2,free entry 2 wkly comp win FA Cup final tkts 2...,1
5,FreeMsg hey darle 3 week word like fun tb ok X...,1
8,WINNER value network customer select receivea ...,1
9,mobile 11 month u r entitle update late colour...,1
11,"chance win cash 100 20,000 pound txt > CSH11 s...",1


In [None]:
# Write the preprocessed data to CSV, leaving out the index column.
df_preprocessed.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)

In [None]:
# Show the (rows, columns) shape of the preprocessed frame.
df_preprocessed.shape

(1494, 2)

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
from transformers import __version__ as transformers_version

# Print the TensorFlow version first, then the transformers version.
for library_version in (tf.__version__, transformers_version):
    print(library_version)

2.17.0
4.44.2


In [None]:
# Load the pretrained BERT tokenizer that will tokenize and encode the data.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint, do_lower_case=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
# Split the data into training, validation, and test sets.
# Both splits are stratified on the label so every subset keeps the class
# ratio of the dataset; with subsets this small an unstratified draw can
# easily end up lopsided.
from sklearn.model_selection import train_test_split

data_texts = df_balanced['preprocessed_text'].to_list()
data_labels = df_balanced['Category_num'].to_list()

# NOTE(review): test_size=0.01 holds out only about 1% of the rows, so the
# test metrics will be very noisy — confirm this is intended.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data_texts, data_labels, test_size=0.01, random_state=0, stratify=data_labels
)

# Carve the validation set out of the remaining training data.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=0, stratify=train_labels
)


In [None]:
# Initialize the BERT model for sequence classification with two labels.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize each split with the BERT tokenizer. All three calls share the same
# options: truncation on, padding on, and TensorFlow tensors as the return type.
encode_options = dict(truncation=True, padding=True, return_tensors="tf")
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
# Create the TensorFlow datasets that feed training and evaluation.
# The shuffle buffer is sized to the whole training split: a fixed buffer of
# 1000 only shuffles uniformly when the split has at most 1000 examples.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_labels)).batch(16)

# Validation and test data are batched but never shuffled.
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
)).batch(64)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(64)


In [None]:
# Compile the model: Adam optimizer, sparse categorical cross-entropy computed
# from logits, and sparse categorical accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, weight_decay=0.0001)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# Callbacks: checkpoint the weights with the lowest validation loss and write
# TensorBoard logs.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./results',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs', update_freq=100)
callbacks = [checkpoint_callback, tensorboard_callback]



In [None]:
# Train the model for seven epochs with the validation set and the callbacks.
training_options = dict(validation_data=val_dataset, epochs=7, callbacks=callbacks)
model.fit(train_dataset, **training_options)

# Evaluate the model on the test set and print the returned values.
results = model.evaluate(test_dataset)
print(f"Test Results: {results}")

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test Results: [0.05614425241947174, 0.9333333373069763]


### Saving the Trained Model and Tokenizer for Future Use

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Write the trained model and its tokenizer to the same Drive folder.
save_directory = "/content/drive/MyDrive/Email classification bert/ModelsAndTokenizer"
model.save_pretrained(save_directory=save_directory)
tokenizer.save_pretrained(save_directory=save_directory)

('/content/drive/MyDrive/Email classification bert/ModelsAndTokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Email classification bert/ModelsAndTokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Email classification bert/ModelsAndTokenizer/vocab.txt',
 '/content/drive/MyDrive/Email classification bert/ModelsAndTokenizer/added_tokens.json')