In [1]:
!pip install pandas
!pip install numpy
!pip install torch
!pip install scikit-learn
!pip install tensorflow

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [22]:
import pandas as pd
import numpy as np
import re
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, TFBertForSequenceClassification
from torch.utils.data import Dataset,DataLoader
import tensorflow as tf
import torch
import torch.nn as nn # Importing the torch.nn module and aliasing it as nn
import torch.optim as optim # Importing the torch.optim module

In [3]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# Loading dataset
dataset_path = "/SMSSmishCollection.txt"
with open(dataset_path, 'r', encoding='utf-8', errors='replace') as file:
    print(file.readlines()[:5])

['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'ham\tOk lar... Joking wif u oni...\n', "smish\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", 'ham\tU dun say so early hor... U c already then say...\n', "ham\tNah I don't think he goes to usf, he lives around here though\n"]


In [6]:
df = pd.read_csv(dataset_path, sep='\t', names=['label', 'text'], encoding='utf-8', on_bad_lines='skip')

In [15]:
# Converting labels
df['label'] = df['label'].map({'smish': 1, 'ham': 0})

In [8]:
# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', ' <URL> ', text)  # Replace URLs with <URL>
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['cleaned_text'] = df['text'].apply(clean_text)


In [9]:
# Tokenization
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()


In [10]:
# Encode labels
y = np.array(df['label'])


In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# PyTorch Dataset
class SMSDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = SMSDataset(X_train, y_train)
test_dataset = SMSDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [23]:
# LSTM Model
class SmishingLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout):
        super(SmishingLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence length dimension
        lstm_out, _ = self.lstm(x)
        x = self.fc1(lstm_out[:, -1, :])
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)

model = SmishingLSTM(input_dim=X_train.shape[1], hidden_dim=128, num_layers=2, output_dim=1, dropout=0.3).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [24]:
# Training loop
epochs = 15
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs.squeeze(), y_batch.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(train_loader):.4f}')


Epoch [1/15], Loss: 0.2665
Epoch [2/15], Loss: 0.0419
Epoch [3/15], Loss: 0.0130
Epoch [4/15], Loss: 0.0071
Epoch [5/15], Loss: 0.0043
Epoch [6/15], Loss: 0.0031
Epoch [7/15], Loss: 0.0026
Epoch [8/15], Loss: 0.0020
Epoch [9/15], Loss: 0.0018
Epoch [10/15], Loss: 0.0014
Epoch [11/15], Loss: 0.0016
Epoch [12/15], Loss: 0.0018
Epoch [13/15], Loss: 0.0016
Epoch [14/15], Loss: 0.0015
Epoch [15/15], Loss: 0.0013


In [25]:
# Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch).squeeze()
        predicted = (outputs > 0.5).float()
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

accuracy = correct / total if total > 0 else 0
print(f'Test Accuracy: {accuracy:.2f}')


Test Accuracy: 0.99


In [29]:
# Prediction function
def predict_sms(sms):
    cleaned_sms = clean_text(sms)
    vectorized_sms = vectorizer.transform([cleaned_sms]).toarray()
    sms_tensor = torch.tensor(vectorized_sms, dtype=torch.float32).to(device)
    with torch.no_grad():
        prediction = model(sms_tensor).item()
    return 'Smishing' if prediction > 0.5 else 'Ham'


In [27]:
# Test prediction
sample_sms = 'Urgent! Your account has been compromised. Visit http://fake-bank.com now.'
print(f'Message: "{sample_sms}" => Prediction: {predict_sms(sample_sms)}')

Message: "Urgent! Your account has been compromised. Visit http://fake-bank.com now." => Prediction: Smishing
