In [98]:
import pandas as pd
import numpy as np

# Text processing
import re
import string

# ML & evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [99]:
# Read the two labelled message corpora. Only the honeypot file is decoded
# as latin1; the SMS file uses pandas' default encoding.
honeypot_csv = "honeypot_messages_5000 (1).csv"
sms_csv = "sms_dataset_processed.csv"

df1 = pd.read_csv(honeypot_csv, encoding="latin1")
df2 = pd.read_csv(sms_csv)

# Preview the first rows of both frames as a single tuple output.
(df1.head(5), df2.head(5))

(                                                Text  scam
 0  Reminder: Your appointment is scheduled for 12...     0
 1  Refund of Rs 7500 is ready. Share your UPI PIN...     1
 2  Thank you for your payment of Rs 999. Transact...     0
 3  Order #254668 has been shipped and will be del...     0
 4  Reminder: Your appointment is scheduled for 1-...     0,
                                                 Text  scam
 0  Your opinion about me? 1. Over 2. Jada 3. Kusr...     0
 1  What's up? Do you want me to come online? If y...     0
 2                       So u workin overtime nigpun?     0
 3  Also sir, i sent you an email about how to log...     0
 4  Please Stay At Home. To encourage the notion o...     1)

In [100]:
# Peek at the last five rows of the honeypot dataset.
df1.tail(5)

Unnamed: 0,Text,scam
10746,no genuine company gives free prizes,0
10747,government warns against lucky draw scams,0
10748,please stay alert against online fraud,0
10749,do not share login credentials with anyone,0
10750,report suspicious messages immediately,0


In [101]:
# Size of the honeypot dataset followed by its label distribution.
print(df1.shape, df1["scam"].value_counts(), sep="\n")

(10751, 2)
scam
1    5500
0    5251
Name: count, dtype: int64


In [102]:
# Size of the SMS dataset followed by its label distribution.
print(df2.shape, df2["scam"].value_counts(), sep="\n")

(5971, 2)
scam
0    5333
1     638
Name: count, dtype: int64


In [103]:
def clean_text(Text):
    """Normalize a raw message before vectorization.

    Lower-cases the text, removes digits and ASCII punctuation, then
    collapses every run of whitespace into a single space and trims both
    ends.

    Parameters
    ----------
    Text : str
        Raw message. ``None`` and float NaN (what pandas yields for an empty
        CSV cell) are treated as an empty message; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned message; may be the empty string.
    """
    if not isinstance(Text, str):
        # NaN is the only float that compares unequal to itself.
        if Text is None or (isinstance(Text, float) and Text != Text):
            return ""
        Text = str(Text)

    text = Text.lower()
    text = re.sub(r'\d+', '', text)            # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace last: removing digits and punctuation can leave
    # gaps ("1. over 2." -> " over  "), so normalizing any earlier would
    # let double, leading and trailing spaces survive.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [104]:
# Add the normalized message text to the honeypot frame, one value per row.
df1["clean_text"] = df1["Text"].map(clean_text)
df1.head(5)

Unnamed: 0,Text,scam,clean_text
0,Reminder: Your appointment is scheduled for 12...,0,reminder your appointment is scheduled for at...
1,Refund of Rs 7500 is ready. Share your UPI PIN...,1,refund of rs is ready share your upi pin to re...
2,Thank you for your payment of Rs 999. Transact...,0,thank you for your payment of rs transaction id
3,Order #254668 has been shipped and will be del...,0,order has been shipped and will be delivered ...
4,Reminder: Your appointment is scheduled for 1-...,0,reminder your appointment is scheduled for at...


In [105]:
# Add the normalized message text to the SMS frame, one value per row.
df2["clean_text"] = df2["Text"].map(clean_text)
df2.head(5)

Unnamed: 0,Text,scam,clean_text
0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,0,your opinion about me over jada kusruthi l...
1,What's up? Do you want me to come online? If y...,0,whats up do you want me to come online if you ...
2,So u workin overtime nigpun?,0,so u workin overtime nigpun
3,"Also sir, i sent you an email about how to log...",0,also sir i sent you an email about how to log ...
4,Please Stay At Home. To encourage the notion o...,1,please stay at home to encourage the notion of...


In [106]:
# `df` was never defined in this notebook, so this cell only ran thanks to
# leftover kernel state. Bind it explicitly: the recorded evaluation reports
# (1051 + 1100 = 2151 test rows) match a stratified 20% split of df1's
# 5251 / 5500 label counts, i.e. the honeypot dataset alone.
# NOTE(review): df2 is cleaned above but never used for training - confirm
# whether it was meant to be concatenated in.
df = df1

X = df["clean_text"]
y = df["scam"]

# Hold out 20% for testing; stratify so both splits keep the label ratio,
# and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [107]:
# TF-IDF over word uni-, bi- and tri-grams with English stop words removed.
# Terms seen in fewer than 2 documents or in more than 95% of documents are
# dropped, and the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [108]:
# Multinomial Naive Bayes with default settings. fit() returns the estimator
# itself, so construction and training are chained; the bare name on the
# last line keeps the estimator as the cell's output.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_model


In [109]:
# Score the Naive Bayes model on the held-out split.
nb_preds = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_preds)
nb_report = classification_report(y_test, nb_preds)

print("Naive Bayes Accuracy:", nb_accuracy)
print(nb_report)

Naive Bayes Accuracy: 0.9939562993956299
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1051
           1       0.99      1.00      0.99      1100

    accuracy                           0.99      2151
   macro avg       0.99      0.99      0.99      2151
weighted avg       0.99      0.99      0.99      2151



In [110]:
# Logistic regression with the iteration cap raised to 1000. fit() returns
# the estimator itself; the bare name on the last line keeps it as the
# cell's output.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
lr_model

In [111]:
# Score the logistic regression model on the held-out split.
lr_preds = lr_model.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_preds)
lr_report = classification_report(y_test, lr_preds)

print("Logistic Regression Accuracy:", lr_accuracy)
print(lr_report)

Logistic Regression Accuracy: 0.9925615992561599
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1051
           1       0.99      1.00      0.99      1100

    accuracy                           0.99      2151
   macro avg       0.99      0.99      0.99      2151
weighted avg       0.99      0.99      0.99      2151



In [112]:
# Linear SVM. class_weight scales the penalty for class 0 by 1.5 relative to
# class 1, so errors on class-0 training messages cost more.
# random_state pins the data shuffling LinearSVC performs when it solves the
# dual problem, making repeated runs reproducible; it uses the same seed
# value as the train/test split.
svm_model = LinearSVC(class_weight={0: 1.5, 1: 1}, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [113]:
# Score the linear SVM on the held-out split.
svm_preds = svm_model.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_preds)
svm_report = classification_report(y_test, svm_preds)

print("SVM Accuracy:", svm_accuracy)
print(svm_report)

SVM Accuracy: 0.9934913993491399
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1051
           1       0.99      1.00      0.99      1100

    accuracy                           0.99      2151
   macro avg       0.99      0.99      0.99      2151
weighted avg       0.99      0.99      0.99      2151



In [116]:
def predict_message(message):
    """Classify one raw message as scam or not scam.

    The message is normalized with ``clean_text``. If the normalized text
    contains both the phrase "do not share" and the word "otp", it is
    reported as not scam without consulting the model. Otherwise it is
    vectorized with the fitted ``tfidf`` and classified by ``svm_model``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        ``"SCAM 🚨"`` when the model predicts label 1, otherwise
        ``"NOT SCAM ✅"``.
    """
    # Re-collapse whitespace after cleaning: stripping punctuation can leave
    # double spaces ("do not - share" -> "do not  share"), which would make
    # the phrase check below miss.
    message = " ".join(clean_text(message).split())

    # Rule override, checked first so the model is not run for a result that
    # would be discarded anyway.
    # NOTE(review): any message containing these two substrings bypasses the
    # classifier, including a scam that quotes the advisory - confirm this
    # is acceptable.
    if "do not share" in message and "otp" in message:
        return "NOT SCAM ✅"

    vector = tfidf.transform([message])
    prediction = svm_model.predict(vector)[0]
    return "SCAM 🚨" if prediction == 1 else "NOT SCAM ✅"

# Example
predict_message("do not share your otp")

'NOT SCAM ✅'