In [2]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [12]:
df = pd.read_csv("/SMSSmishCollection.txt", sep='\t', names=['label', 'text'], encoding='utf-8', on_bad_lines='skip')
print (df)
df['label'] = df['label'].map({'smish': 1, 'ham': 0})

      label                                               text
0       ham  Go until jurong point, crazy.. Available only ...
1       ham                      Ok lar... Joking wif u oni...
2     smish  Free entry in 2 a wkly comp to win FA Cup fina...
3       ham  U dun say so early hor... U c already then say...
4       ham  Nah I don't think he goes to usf, he lives aro...
...     ...                                                ...
5567  smish  This is the 2nd time we have tried 2 contact u...
5568    ham               Will ü b going to esplanade fr home?
5569    ham  Pity, * was in mood for that. So...any other s...
5570    ham  The guy did some bitching but I acted like i'd...
5571    ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [13]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', ' <URL> ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

df['cleaned_text'] = df['text'].apply(clean_text)


In [14]:
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
class SMSDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [16]:
train_loader = DataLoader(SMSDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(SMSDataset(X_test, y_test), batch_size=32)


In [17]:
class SmishingLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2, dropout=0.3):
        super(SmishingLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        x = self.fc1(lstm_out[:, -1, :])
        x = self.fc2(x)
        return self.sigmoid(x)


In [18]:
model = SmishingLSTM(input_dim=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [19]:
for epoch in range(15):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch).squeeze()
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch [{epoch+1}/15], Loss: {total_loss/len(train_loader):.4f}')


Epoch [1/15], Loss: 0.2325
Epoch [2/15], Loss: 0.0383
Epoch [3/15], Loss: 0.0121
Epoch [4/15], Loss: 0.0070
Epoch [5/15], Loss: 0.0043
Epoch [6/15], Loss: 0.0035
Epoch [7/15], Loss: 0.0035
Epoch [8/15], Loss: 0.0024
Epoch [9/15], Loss: 0.0024
Epoch [10/15], Loss: 0.0020
Epoch [11/15], Loss: 0.0019
Epoch [12/15], Loss: 0.0019
Epoch [13/15], Loss: 0.0018
Epoch [14/15], Loss: 0.0017
Epoch [15/15], Loss: 0.0014


In [20]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch).squeeze()
        predicted = (outputs > 0.5).float()
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

accuracy = correct / total
print(f'Test Accuracy: {accuracy:.2f}')


Test Accuracy: 0.97
