In [23]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib


In [24]:
# Read the raw dataset; only its first two columns (label, message text) are used.
raw = pd.read_csv("spam.csv", encoding="latin-1")

# Take an explicit copy of the two-column slice so the assignments below act on
# an independent frame (assigning into a bare slice can raise SettingWithCopyWarning).
df = raw.iloc[:, :2].copy()
df.columns = ["label", "text"]

# Normalise the label text (trim, lowercase, drop every non-letter character)
# and encode it: ham -> 0, spam -> 1. Any other value becomes NaN.
df["label"] = (
    df["label"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r'[^a-zA-Z]', '', regex=True)
    .map({"ham": 0, "spam": 1})
)

# Drop rows whose label was not recognised, then store the labels as integers.
df = df.dropna(subset=["label"]).astype({"label": int})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


Cleaning 

In [25]:
def clean_txt(text):
    """Return a lowercase, alphanumeric-only version of ``text``.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` is replaced by a space,
    runs of whitespace are collapsed to a single space, and leading/trailing
    whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (empty if nothing alphanumeric remains).
    """
    alnum_only = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    return re.sub(r"\s+", " ", alnum_only).strip()

# Cast the message column to str so clean_txt always receives a string,
# then store the normalised text alongside the original.
text_as_str = df["text"].astype(str)
df["clean"] = text_as_str.map(clean_txt)
df.head()


Unnamed: 0,label,text,clean
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives arou...


Split

In [26]:

# 80/20 hold-out split on the cleaned text, stratified on the label so both
# partitions keep the same ham/spam ratio; the seed is fixed for reproducibility.
features, targets = df["clean"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42, stratify=targets
)

Tokenization / Naive Bayes Classifier

In [27]:
# TF-IDF over unigrams and bigrams, ignoring terms that appear in fewer than
# two training documents, feeding a Multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
classifier = MultinomialNB()

model = Pipeline(steps=[("tfidf", vectorizer), ("nb", classifier)])

model.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


Prediction / Accuracy

In [28]:
# Score the held-out messages, then report accuracy, the confusion matrix and
# the per-class precision/recall/F1 table.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

Accuracy: 0.9605381165919282

Confusion Matrix:
 [[966   0]
 [ 44 105]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.70      0.83       149

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_test, y_test):
    """Compute summary classification metrics for ``model`` on a test set.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_test: Inputs passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``, each holding the value returned by the corresponding
        scikit-learn scoring function.
    """
    predictions = model.predict(X_test)

    # (result key, scoring function) pairs, evaluated in this order.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    return {key: scorer(y_test, predictions) for key, scorer in scorers}

# Evaluate the fitted pipeline on the held-out split and show the resulting dict.
metrics = evaluate_model(model=model, X_test=X_test, y_test=y_test)
print(metrics)



{'accuracy': 0.9605381165919282, 'precision': 1.0, 'recall': 0.7046979865771812, 'f1_score': 0.8267716535433071}


In [30]:
def predict_message(message):
    """Classify a single raw message with the fitted pipeline.

    The message is normalised with ``clean_txt`` before being scored.

    Args:
        message: Raw message text.

    Returns:
        A dict with ``prediction`` ("Spam" when the predicted label is 1,
        otherwise "Ham") and ``spam_probability`` / ``ham_probability``,
        taken from columns 1 and 0 of ``predict_proba`` and rounded to
        four decimal places.
    """
    sample = [clean_txt(message)]
    predicted_label = model.predict(sample)[0]
    class_probs = model.predict_proba(sample)[0]

    verdict = "Ham"
    if predicted_label == 1:
        verdict = "Spam"

    return {
        "prediction": verdict,
        "spam_probability": round(float(class_probs[1]), 4),
        "ham_probability": round(float(class_probs[0]), 4),
    }


# Class probabilities for every message in the held-out split.
proba = model.predict_proba(X_test)


Checking

In [31]:
def check(message=None, spam_threshold=60, suspicious_threshold=30):
    """Classify one message and print a three-level verdict with probabilities.

    Args:
        message: Raw message text. When ``None`` (the default) the user is
            prompted for it with ``input()``; pass a string to run without a
            prompt, e.g. under Restart & Run All.
        spam_threshold: Spam probability, in percent, above which the message
            is labelled spam.
        suspicious_threshold: Spam probability, in percent, above which (and
            up to ``spam_threshold``) the message is labelled suspicious.

    Returns:
        None. The verdict and both class probabilities are printed.
    """
    if message is None:
        message = input("Enter the email: ")
    cleaned = clean_txt(message)

    # Column 0 is read as the ham probability and column 1 as the spam probability.
    proba = model.predict_proba([cleaned])[0]
    spam_prob = proba[1] * 100
    ham_prob = proba[0] * 100

    if spam_prob > spam_threshold:
        label = "Spam 🚨"
    elif spam_prob > suspicious_threshold:
        label = "Suspicious ⚠️"
    else:
        label = "Ham ✅"

    print(f"{label}")
    print(f"Spam Probability: {spam_prob:.2f}%")
    print(f"Ham Probability: {ham_prob:.2f}%")


In [32]:
# Interactive run: prompts for a message via input() and prints the verdict.
check()


Ham ✅
Spam Probability: 5.50%
Ham Probability: 94.50%


In [33]:
# Persist the fitted pipeline together with its evaluation metrics in one file.
artifact = {"model": model, "metrics": metrics}
joblib.dump(artifact, "spam_model.pkl")


['spam_model.pkl']