# EMAIL SPAM DETECTION USING LOGISTIC REGRESSION

### Importing necessary Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


### Load data

In [15]:
df = pd.read_csv("D:\AI Training Projects\Logistic Regression\spam mail.csv")
df.head()

Unnamed: 0,Category,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Categorize the dataset

In [16]:
df["Category"] = df["Category"].map({"spam":1, "ham":0})
df.head()

Unnamed: 0,Category,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Check columns

In [17]:
print(df.columns)

Index(['Category', 'Messages'], dtype='object')


### Extract Simple numeric features

In [18]:
df["length"] = df["Messages"].apply(len)
df["digits"] = df["Messages"].apply(lambda x: sum(c.isdigit() for c in x))
df["special"] = df["Messages"].apply(lambda x: sum(not c.isalnum() and not c.isspace() for c in x))

In [19]:
class_ = ["ham", "spam"]

### Label Encoding (Ham=0, spam=1)

In [20]:
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["Category"])

### Keep original messages for printing later

In [21]:
X_text = df["Messages"].copy()
X = df[["length", "digits", "special"]]
y = df["Category"]

In [22]:
X

Unnamed: 0,length,digits,special
0,111,0,9
1,29,0,6
2,155,25,6
3,49,0,6
4,61,0,2
...,...,...,...
5567,161,21,9
5568,37,0,2
5569,57,0,7
5570,125,0,1


### Train Test Split

In [23]:
X_train, X_test, y_train, y_test, X_train_text, X_test_text = train_test_split(
    X, y, X_text, test_size=0.20, random_state=42
)

### Scaling the features

In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Train Logistic regression model

In [25]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [26]:
sample_email = X_test_scaled
sample_email

array([[ 1.23186777,  0.41854406, -0.48501146],
       [ 2.44262424, -0.37797638, -0.0653054 ],
       [ 0.36704172,  2.33019313, -0.27515843],
       ...,
       [-0.29022608, -0.37797638,  0.35440066],
       [-0.99938343, -0.37797638, -0.48501146],
       [-0.09996434,  1.05576042, -0.27515843]])

### Predict Sigmoid probabilities for all test samples

In [27]:
probabilities = model.predict_proba(sample_email)
probabilities



array([[0.985643  , 0.014357  ],
       [0.99207063, 0.00792937],
       [0.95307711, 0.04692289],
       ...,
       [0.99303713, 0.00696287],
       [0.99180245, 0.00819755],
       [0.97954468, 0.02045532]])

### User Input Email prediction using trained model

In [30]:
n_samples = 5

# Random indexes from y_test
random_idx = np.random.choice(len(y_test), n_samples, replace=False)

print(f"Categories: {class_}")
print("="*60)

for i, idx  in enumerate(random_idx):
    # If using softmax (2 outputs: ham, spam)
    prob = probabilities[idx]
    spam_prob = prob[1]   # P(y=1) = Spam

    prediction = "Spam" if spam_prob > 0.5 else "Ham"

    email_text = X_test_text.iloc[idx]   # X_test_text = original test messages
    
user_email = input("\nEnter an email text to classify: ")
user_email = str(user_email)

# Extract same numeric features used in training
user_length = len(user_email)
user_digits = sum(c.isdigit() for c in user_email)
user_special = sum(not c.isalnum() and not c.isspace() for c in user_email)

# Create a single-row dataframe for prediction
user_features = pd.DataFrame([{
    "length": user_length,
    "digits": user_digits,
    "special": user_special
}])

# Predict probability
user_prob = model.predict_proba(user_features)[0][1]   # P(spam)
user_pred = "Spam" if user_prob > 0.5 else "Ham"

print("\n--- USER EMAIL PREDICTION ---")
print(f"Email Entered:\n{user_email}\n")
print(f"Sigmoid Output = {user_prob:.4f} ({user_prob*100:.1f}%)")
print(f"Final Prediction: {user_pred}")
print("------------------------------------")


Categories: ['ham', 'spam']

--- USER EMAIL PREDICTION ---
Email Entered:
URGENT! You have won a 1 week FREE membership in our Ã¥Â£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18

Sigmoid Output = 0.9999 (100.0%)
Final Prediction: Spam
------------------------------------
