### Email Spam Detection

### Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


### Creating a dataset 

In [6]:

data = {
    'sender': [
        'boss@company.com', 'lottery@winner.xyz', 'hr@company.com',
        'prince@nigeria-bank.net', 'support@google.com', 'free-btc@crypto.net',
        'mom@gmail.com', 'marketing@spam-service.com'
    ],
    'subject': [
        'Meeting update', 'YOU WON $1,000,000', 'Policy Change',
        'URGENT TRANSFER', 'Security Alert', 'FREE BITCOIN NOW',
        'Hello dear', 'CLICK HERE FOR PRIZES'
    ],
    'body': [
        'Please attend the meeting at 3 PM today.',
        'Click this link to claim your prize money now!',
        'Please review the attached employee policy.',
        'I need your bank account for urgent transfer.',
        'Someone tried to access your account.',
        'Get rich quick with this crypto scheme.',
        'How are you doing? Let us have lunch.',
        'You have been selected as a winner.'
    ],
    'label': [0, 1, 0, 1, 0, 1, 0, 1]
    # 0 = Ham (Safe), 1 = Spam
}

df = pd.DataFrame(data)
print(f"Dataset created with {len(df)} emails.\n")
print("The dataframe is \n",df)







Dataset created with 8 emails.

The dataframe is 
                        sender                subject  \
0            boss@company.com         Meeting update   
1          lottery@winner.xyz     YOU WON $1,000,000   
2              hr@company.com          Policy Change   
3     prince@nigeria-bank.net        URGENT TRANSFER   
4          support@google.com         Security Alert   
5         free-btc@crypto.net       FREE BITCOIN NOW   
6               mom@gmail.com             Hello dear   
7  marketing@spam-service.com  CLICK HERE FOR PRIZES   

                                             body  label  
0        Please attend the meeting at 3 PM today.      0  
1  Click this link to claim your prize money now!      1  
2     Please review the attached employee policy.      0  
3   I need your bank account for urgent transfer.      1  
4           Someone tried to access your account.      0  
5         Get rich quick with this crypto scheme.      1  
6           How are you doing? 

### Data  Preprocessing

In [7]:
# Metadata features
# Heuristic: Spam emails often have longer, weirder email addresses.
df['sender_length'] = df['sender'].apply(len)

# pattern features
# Heuristic: Spammers often use ALL CAPS in subjects.
def count_caps(text):
    return sum(1 for char in text if char.isupper())

df['subject_caps_count'] = df['subject'].apply(count_caps)

#keyword feature
# We use CountVectorizer to turn words into numbers (Bag of Words).
# We only keep the top 10 most common words to keep this example simple.
vectorizer = CountVectorizer(stop_words='english', max_features=10)
text_features = vectorizer.fit_transform(df['body']).toarray()

# Convert text features to a DataFrame so we can merge them
text_df = pd.DataFrame(text_features, columns=vectorizer.get_feature_names_out())

#merging all features
# Combine Metadata + Patterns + Keywords into one big table (X)
X = pd.concat([df[['sender_length', 'subject_caps_count']], text_df], axis=1)
y = df['label']

print("Features Prepared. Here is a snippet of the data the model sees:")
print(X.head(3))

Features Prepared. Here is a snippet of the data the model sees:
   sender_length  subject_caps_count  access  account  attached  attend  bank  \
0             16                   1       0        0         0       1     0   
1             18                   6       0        0         0       0     0   
2             14                   2       0        0         1       0     0   

   claim  click  crypto  doing  employee  
0      0      0       0      0         0  
1      1      1       0      0         0  
2      0      0       0      0         1  


### Data Splitting

In [8]:
# 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Data Size: {len(X_train)}")
print(f"Testing Data Size: {len(X_test)}\n")

Training Data Size: 6
Testing Data Size: 2



### Model Training

In [9]:
# We use Random Forest because it handles mixed data (numbers + counts) very well
model = RandomForestClassifier(n_estimators=100, random_state=42)

# The model learns the relationship between X (features) and y (labels)
model.fit(X_train, y_train)
print("Model trained successfully.\n")

Model trained successfully.



### Model Evaluation

In [None]:
# Predict on the test set
predictions = model.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

### Prediction on new data

In [14]:
def predict_spam(sender, subject, body):
    #  Extract Metadata Feature
    f_sender_len = len(sender)

    # Extract Pattern Feature
    f_subject_caps = count_caps(subject)

    # Extract Keyword Features
    f_text = vectorizer.transform([body]).toarray()
    f_text_df = pd.DataFrame(f_text, columns=vectorizer.get_feature_names_out())

    # Combine
    input_data = pd.DataFrame([[f_sender_len, f_subject_caps]], columns=['sender_length', 'subject_caps_count'])
    final_input = pd.concat([input_data, f_text_df], axis=1)

    #Predict
    prediction = model.predict(final_input)[0]
    probability = model.predict_proba(final_input)[0][1] # Probability of being spam

    result = "SPAM" if prediction == 1 else "HAM (Safe)"
    print(f"Email: '{subject}'\nVerdict: {result} (Confidence: {probability:.2f})")

#testing
# Case 1: Likely Spam (Caps lock subject, 'prize' keyword, weird sender)
predict_spam("unknown@weird-domain.net", "YOU HAVE WON A PRIZE", "Click here to claim your prize money.")

# Case 2: Likely Safe (Normal casing, 'meeting' keyword, short sender)
predict_spam("bob@work.com", "Meeting notes", "Here are the notes from the meeting.")
#Case 3:
predict_spam("babin@gmail.com","shop","Things we have to buy the following things ")
#case 4:
predict_spam("jelekhe@gmail.com","Won","You have a flat 10 % percent send the money to claim it free")

Email: 'YOU HAVE WON A PRIZE'
Verdict: SPAM (Confidence: 0.82)
Email: 'Meeting notes'
Verdict: HAM (Safe) (Confidence: 0.09)
Email: 'shop'
Verdict: HAM (Safe) (Confidence: 0.09)
Email: 'Won'
Verdict: HAM (Safe) (Confidence: 0.09)
