In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv("spam_ham_dataset.csv")

In [23]:
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [24]:
df = df[['text', 'label_num']]  

In [25]:
df

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...
5166,Subject: put the 10 on the ft\r\nthe transport...,0
5167,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,Subject: industrial worksheets for august 2000...,0


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_num'], test_size=0.2, random_state=42
)

In [27]:
vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [28]:
model = LogisticRegression(max_iter=1000, C=0.1, solver='liblinear')
model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)

In [29]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       742
           1       0.94      0.98      0.96       293

    accuracy                           0.98      1035
   macro avg       0.96      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [30]:
# TF-IDF Weighting

df = pd.read_csv("spam_ham_dataset.csv")
df = df[['text', 'label_num']]  

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_num'], test_size=0.2, random_state=42
)

# BoW features
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train model
model = LogisticRegression(max_iter=1000, C=0.1, solver='liblinear')
model.fit(X_train_bow, y_train)

# Evaluate
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       742
           1       0.94      0.98      0.96       293

    accuracy                           0.98      1035
   macro avg       0.96      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [31]:
index = 384
sample_message = df.loc[index, 'text']
actual_label = df.loc[index, 'label_num']

In [32]:
sample_message

'Subject: re : txu may 01\r\nthis deal was not transported on contract 1856 , the 15 , 000 was sold to el paso merchant directly at that point . it was 15 , 000 a day for the 30 th and 31 st .\r\nlet me know if you need anything further .\r\nmike\r\n- - - - - original message - - - - -\r\nfrom : beale , antoinette\r\nsent : wednesday , october 03 , 2001 12 : 48 pm\r\nto : olsen , michael\r\ncc : parker , megan ; farmer , daren j .\r\nsubject : txu may 01\r\nmike ,\r\ni have the following outstanding issue with txu :\r\nmay 01 - contract 1856\r\na path is needed on delivery 17054701 for 19087 mmbtus with the receipt point 175054701 .\r\nthanks !'

In [33]:
actual_label

0

In [34]:
sample_transformed = vectorizer.transform([sample_message])

predicted_label = model.predict(sample_transformed)[0]

print(f"Original Message: {sample_message}")
print(f"Actual Category: {actual_label}")
print(f"Predicted Category: {predicted_label}")

if predicted_label == actual_label:
    print("correct!")
else:
    print("incorrect.")

Original Message: Subject: re : txu may 01
this deal was not transported on contract 1856 , the 15 , 000 was sold to el paso merchant directly at that point . it was 15 , 000 a day for the 30 th and 31 st .
let me know if you need anything further .
mike
- - - - - original message - - - - -
from : beale , antoinette
sent : wednesday , october 03 , 2001 12 : 48 pm
to : olsen , michael
cc : parker , megan ; farmer , daren j .
subject : txu may 01
mike ,
i have the following outstanding issue with txu :
may 01 - contract 1856
a path is needed on delivery 17054701 for 19087 mmbtus with the receipt point 175054701 .
thanks !
Actual Category: 0
Predicted Category: 0
correct!
