In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt_tab to /home/ravi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ravi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ravi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd

df = pd.read_csv(
    "../DataSets/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"]
)

print(df.iloc[0,0])
print(df.iloc[1,0])
print(df.head(1))
print(df.shape)

# Encoding Labels
df["label"] = df["label"].map({"ham": 0, "spam": 1})

print("=============================")
print(df.iloc[0,0])
print(df.iloc[1,0])


2Ô∏è‚É£ Text Preprocessing (Cleaning + Tokenization)

We‚Äôll do:

    lowercase
    remove URLs, numbers, punctuation
    tokenize
    remove stopwords
    lemmatize

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
print(f"stops words len: {len(stop_words)}")
print(stop_words)
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)     # remove URLs
    text = re.sub(r"[^a-z\s]", "", text)           # remove punctuation & numbers
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) 
              for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["clean_message"] = df["message"].apply(clean_text)

#df.to_csv('../DataSets/SMSSpamCollection.csv')




stops words len: 198
{'can', 'my', 'have', 'our', 'up', 'again', 'too', 'there', "he'd", 'ma', "they'd", 'being', 'been', 'yours', 'themselves', 'ain', 'wasn', 'until', 'wouldn', 'some', 'through', "we'd", 'few', 'how', "wouldn't", 'or', 'about', 'll', "i'd", "it'd", 'has', 'hers', 'very', 'when', 'it', "we're", 'an', 'won', 'your', 'i', 'same', 'the', 'these', "we'll", 'mightn', 'ours', "she'll", 'than', 'out', 'just', 'while', 'shan', "i've", "they'll", 'both', 'haven', 'o', 'no', 'she', 'why', 'those', 'who', 'of', "needn't", 'nor', 'hasn', 'not', 'other', 'any', 'only', 'y', 'such', 'here', 'what', "you'd", 'does', 'they', 'to', 'in', 'himself', "i'm", 'between', "couldn't", 'doesn', 'aren', "i'll", 'before', 'was', "haven't", 'below', 'them', 'all', 'further', 'her', 'its', 'itself', 'should', 'this', 'at', 'be', 'mustn', 'needn', "you're", 'doing', "she's", "mustn't", 'shouldn', 'against', 'into', "won't", 'as', 'did', 'had', "he's", 'don', 'but', 'most', 'myself', 're', 'under',

[nltk_data] Downloading package stopwords to /home/ravi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ravi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [114]:

X = df["clean_message"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
) 


5Ô∏è‚É£ Feature Extraction (TF-IDF)

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

6Ô∏è‚É£ Naive Bayes Spam Classifier ‚úÖ

In [116]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("==============================")
print(confusion_matrix(y_test, y_pred))
print("==============================")
print(classification_report(y_test, y_pred))


Accuracy: 0.9713004484304932
[[966   0]
 [ 32 117]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



Logistic Regression

In [117]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)

print("LR Accuracy:", accuracy_score(y_test, lr.predict(X_test_vec)))



LR Accuracy: 0.9730941704035875


Support Vector Machine

In [118]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_vec, y_train)

print("SVM Accuracy:", accuracy_score(y_test, svm.predict(X_test_vec)))


SVM Accuracy: 0.9829596412556054


In [121]:
def predict_sms(text):
    clean = clean_text(text)
    vec = vectorizer.transform([clean])
    return "SPAM üö®" if model.predict(vec)[0] == 1 else "HAM ‚úÖ"

def predict_sms_from_svm(text):
    clean = clean_text(text)
    vec = vectorizer.transform([clean])
    return "SPAM üö®" if svm.predict(vec)[0] == 1 else "HAM ‚úÖ"

def predict_sms_from_lr(text):
    clean = clean_text(text)
    vec = vectorizer.transform([clean])
    return "SPAM üö®" if lr.predict(vec)[0] == 1 else "HAM ‚úÖ"

In [122]:

print(predict_sms("Congratulations! You won a free mobile recharge"))
print(predict_sms( "Hey, are we still on for dinner tonight?"))
print(predict_sms("WINNER!! As a valued network customer you have been selected to receivea ¬£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."))

print("=================================================")
print(predict_sms_from_svm("Congratulations! You won a free mobile recharge"))
print(predict_sms_from_svm( "Hey, are we still on for dinner tonight?"))
print(predict_sms_from_svm("WINNER!! As a valued network customer you have been selected to receivea ¬£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."))

print("=================================================")
print(predict_sms_from_lr("Congratulations! You won a free mobile recharge"))
print(predict_sms_from_lr( "Hey, are we still on for dinner tonight?"))
print(predict_sms_from_lr("WINNER!! As a valued network customer you have been selected to receivea ¬£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."))




SPAM üö®
HAM ‚úÖ
SPAM üö®
SPAM üö®
HAM ‚úÖ
SPAM üö®
SPAM üö®
HAM ‚úÖ
SPAM üö®
