In [1]:
# Requires pandas, nltk and scikit-learn. If any are missing, run once in this kernel:
#   %pip install pandas nltk scikit-learn

import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# and the WordNet corpus.
print("Downloading NLTK resources...")
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)
print("Downloads complete.")

Downloading NLTK resources...
Downloads complete.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tarru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, stored as a set so membership checks during
# preprocessing are cheap.
stop_words = set(stopwords.words('english'))

# Single lemmatizer instance shared by every call to the preprocessing step.
lemmatizer = WordNetLemmatizer()

In [3]:
url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'

# Read the latin-1 encoded CSV, discard the three auto-named 'Unnamed' columns
# and give the two remaining columns descriptive names.
df = pd.read_csv(url, encoding='latin-1').drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)
df.columns = ['label', 'text']

print("--- First 5 rows of the dataset ---")
df.head()

--- First 5 rows of the dataset ---


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
def preprocess_text(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Processing steps, in order:
      1. replace every character that is not an ASCII letter with a space;
      2. lower-case the result and split it on whitespace;
      3. drop tokens found in the module-level ``stop_words`` set;
      4. lemmatize each remaining token with the module-level ``lemmatizer``;
      5. join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw message. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for a missing cell) is treated as an
        empty message instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned message; an empty string when nothing survives cleaning
        or when ``text`` is not a string.
    """
    # Missing values reach this function as NaN/None when applied over a
    # DataFrame column; there is nothing to tokenize in that case.
    if not isinstance(text, str):
        return ""

    # Digits, punctuation and any non-ASCII characters all become separators.
    text = re.sub('[^a-zA-Z]', ' ', text).lower()

    tokens = text.split()

    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(clean_tokens)

# Clean every message and store the result next to the raw text.
df['processed_text'] = df['text'].map(preprocess_text)

print("--- Dataset after Preprocessing ---")
df.loc[:, ['label', 'processed_text']].head()

--- Dataset after Preprocessing ---


Unnamed: 0,label,processed_text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [5]:
# Targets are the ham/spam labels; features are the cleaned message strings.
y = df['label']
X = df['processed_text']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified on the label. With classes this
# imbalanced, consider `stratify=y` -- doing so changes every number reported
# further down, so it is left as-is here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Bag-of-words counts. The vocabulary is learned from the training messages
# only and then reused unchanged to encode the test messages.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

print(f"Training data shape: {X_train_bow.shape}")
print(f"Testing data shape: {X_test_bow.shape}")

Training data shape: (4457, 6267)
Testing data shape: (1115, 6267)


In [6]:
# Build the logistic-regression model (allowing up to 1000 solver iterations)
# and train it on the bag-of-words training matrix. `fit` returns the
# estimator itself, so construction and training are chained.
classifier = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)

print("Model training complete.")

Model training complete.


In [7]:
# Score the trained classifier on the held-out test messages.
y_pred = classifier.predict(X_test_bow)

accuracy = accuracy_score(y_test, y_pred)
print(f"--- Model Accuracy ---\n{accuracy:.4f}\n")

# Pin the class order explicitly. The row/column captions below are
# hard-coded as Ham-then-Spam, so the matrix must use that same order rather
# than whatever order is inferred from the labels present in the data. This
# also guarantees a 2x2 matrix even if one class is absent from the test set.
cm = confusion_matrix(y_test, y_pred, labels=['ham', 'spam'])
df_cm = pd.DataFrame(cm, index=['Actual Ham', 'Actual Spam'], columns=['Predicted Ham', 'Predicted Spam'])
print("--- Confusion Matrix ---")
print(df_cm)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

--- Model Accuracy ---
0.9776

--- Confusion Matrix ---
             Predicted Ham  Predicted Spam
Actual Ham             965               0
Actual Spam             25             125

--- Classification Report ---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

