In [1]:
import pandas as pd

df = pd.read_csv("spam.csv", encoding="latin-1")
print(df.shape)
df = df.drop_duplicates()
print(df.shape)

df = df[['v1', 'v2']]  
df = df.iloc[:, :2]


df.columns = ['label', 'text']
print(df.isnull().sum())
print(df.shape)

df.head()

(5572, 5)
(5169, 5)
label    0
text     0
dtype: int64
(5169, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:

df['label_num'] = df['label'].map({'ham':0, 'spam':1})

df.head()

Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
extra_spam = [
    "Your parcel cannot be delivered because of missing information; update your address using the website in this message",
    "A package in your name is waiting but delivery fees are pending; pay now through this link to avoid return",
    "Your bank account has unusual activity; verify your details immediately using this link"
]

extra_ham = [
    "Your package from Amazon will arrive tomorrow",
    "Your bank statement is ready in the mobile app",
    "Your food delivery is on the way"
]
extra_ham += [
    "Your bank has updated its mobile app; you can read more about the changes on the official website or in the branch.",
    "We improved the mobile banking experience; feel free to update the app from your usual app store whenever convenient.",
    "Your bank has released new features in the mobile app; check the news section in your account for details, no action required.",
    "Your monthly statement is available in the mobile app; open it when you have time to review your recent transactions.",
    "Security tips: Your bank reminds you to keep your app and phone updated; never click on links from unknown messages.",
     "Thanks for using our service; your feedback helps us improve. You can rate us inside your usual app.",
     "As a member, you sometimes receive extra training sessions; details are in your gym account under offers.",
    "Hey can I call you?"
]

extra_spam += [
    "Delivery fee is still pending for your package; complete the payment using this link now",
    "Your shipment is on hold due to unpaid charges; pay the outstanding balance via the website below",
    "We could not deliver your parcel today; confirm your payment details on the secure link in this message"
]
extra_spam += [
    "A relative is in an accident and needs money urgently; transfer funds to the account in this message right away.",
    "Someone close to you is in legal trouble and requires immediate bail; send a payment using the listed method now.",
    "Your son has been arrested and needs money immediately; wire the funds to the account in this text.",
    "Your daughter is in the hospital and treatment will stop without payment; send money now using this link.",
    "A family member is in serious danger and needs emergency funds; respond with a transfer as soon as possible."
]
extra_spam += [
    "Someone close to you is in legal trouble and requires immediate bail; send a payment using the listed method now.",
    "Your brother has been arrested and urgently needs bail money; transfer the funds to the account in this message.",
    "Your cousin is being held and cannot be released without bail; pay the requested amount now via this link.",
    "A family member is facing court today and needs immediate bail payment; send money using the details in this text."
    
]

extra_ham += [
    "Your bank has updated its mobile app; you can read more about the changes on the official website or in the branch.",
    "We improved the mobile banking experience; feel free to update the app from your usual app store whenever convenient.",
    "Your bank has released new features in the mobile app; check the news section in your account for details, no action required.",
    "Thanks for using our service; your feedback helps us improve. You can rate us inside your usual app.",

"As a member, you sometimes receive extra training sessions; details are in your gym account under offers."
    
]
extra_spam += [
    "We saw your profile and would like to offer you a flexible work-from-home position; confirm your interest and pay the registration fee on our portal.",
    "You have been selected for a high-income remote job; complete the registration fee payment today to start.",
    "No experience needed: start earning from home now; pay the small signup fee using this link to begin.",
    "Your application for a remote position is pre-approved; secure your spot by paying the processing fee online."
]


extra = pd.DataFrame({
    "label": ["spam"] * len(extra_spam) + ["ham"] * len(extra_ham),
    "text": extra_spam + extra_ham
})

df2 = pd.concat([df, extra], ignore_index=True)
df2["label_num"] = df2["label"].map({"ham": 0, "spam": 1})
df2.tail()


Unnamed: 0,label,text,label_num
5199,ham,Your bank has updated its mobile app; you can ...,0
5200,ham,We improved the mobile banking experience; fee...,0
5201,ham,Your bank has released new features in the mob...,0
5202,ham,Thanks for using our service; your feedback he...,0
5203,ham,"As a member, you sometimes receive extra train...",0


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df2['text'], df2['label_num'], test_size=0.2, random_state=42
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=2000, class_weight="balanced")
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9779058597502401
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       890
           1       0.92      0.93      0.92       151

    accuracy                           0.98      1041
   macro avg       0.95      0.96      0.96      1041
weighted avg       0.98      0.98      0.98      1041



In [6]:
test_messages = [
    "A relative is in an accident and needs money urgently; transfer funds to the account in this message right away.",
    "Someone close to you is in legal trouble and requires immediate bail; send a payment using the listed method now.",
    "Win a free iPhone now!",
     "Your bank has updated its mobile app; you can read more about the changes on the official website or in the branch.",
    "We improved the mobile banking experience; feel free to update the app from your usual app store whenever convenient.",
    "Your bank has released new features in the mobile app; check the news section in your account for details, no action required.",
    "Thanks for using our service; your feedback helps us improve. You can rate us inside your usual app.",

    "As a member, you sometimes receive extra training sessions; details are in your gym account under offers.",
    "Hey can I call you?",
    "We saw your profile and would like to offer you a flexible work-from-home position; confirm your interest and pay the registration fee on our portal.",
    "You have been selected for a high-income remote job; complete the registration fee payment today to start.",
    "No experience needed: start earning from home now; pay the small signup fee using this link to begin.",
    "Your application for a remote position is pre-approved; secure your spot by paying the processing fee online."
]

for msg in test_messages:
    vec = vectorizer.transform([msg])
    prob = model.predict_proba(vec)[0][1] 
    pred = 1 if prob >= 0.45 else 0       
    print(msg, "→", "SPAM" if pred == 1 else "NOT spam", f"(p_spam={prob:.2f})")


A relative is in an accident and needs money urgently; transfer funds to the account in this message right away. → SPAM (p_spam=0.69)
Someone close to you is in legal trouble and requires immediate bail; send a payment using the listed method now. → SPAM (p_spam=0.77)
Win a free iPhone now! → SPAM (p_spam=0.85)
Your bank has updated its mobile app; you can read more about the changes on the official website or in the branch. → NOT spam (p_spam=0.39)
We improved the mobile banking experience; feel free to update the app from your usual app store whenever convenient. → SPAM (p_spam=0.51)
Your bank has released new features in the mobile app; check the news section in your account for details, no action required. → SPAM (p_spam=0.60)
Thanks for using our service; your feedback helps us improve. You can rate us inside your usual app. → NOT spam (p_spam=0.45)
As a member, you sometimes receive extra training sessions; details are in your gym account under offers. → NOT spam (p_spam=0.29)
He

In [7]:
import pickle


with open("spam_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
