**SPAM EMAIL CLASSIFICATION MODEL**

**Data Importation and Exploration**

In [27]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import joblib
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import nltk
from sklearn.model_selection import train_test_split
import imblearn
import sklearn

In [2]:
# Read the labelled message dataset and eyeball a handful of random rows.
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
csv_path = "/content/data/Email_spam.csv"
df = pd.read_csv(csv_path)
df.sample(n=4)

Unnamed: 0,Category,Message
822,ham,On the road so cant txt
4708,ham,Wif my family booking tour package.
5377,spam,The current leading bid is 151. To pause this ...
4449,ham,I sent them. Do you like?


In [3]:
# True if any cell anywhere in the frame is missing (reduce over columns, then over the result).
df.isna().any().any()

False

In [4]:
# (rows, columns) of the raw dataset.
df.shape

(5572, 2)

In [5]:
# Per-class summary of the messages: count, number of unique texts, most common text and its frequency.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# Class distribution — shows how imbalanced ham vs. spam is.
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
# Encode the label as a binary target: 1 for "spam", 0 for anything else.
# A vectorised comparison replaces the per-row Python lambda; the resulting
# column is still int64.
df["spam"] = (df["Category"] == "spam").astype("int64")
df.sample(4)

Unnamed: 0,Category,Message,spam
5570,ham,The guy did some bitching but I acted like i'd...,0
2028,ham,No got new job at bar in airport on satsgettin...,0
1593,ham,Will it help if we propose going back again to...,0
242,ham,PLEASSSSSSSEEEEEE TEL ME V AVENT DONE SPORTSx,0


In [8]:
# Fetch the Punkt tokenizer data used by nltk.word_tokenize below.
nltk.download("punkt")
def preprocess_text(text):
    """Tokenize ``text`` with NLTK and re-join the tokens with single spaces.

    Nothing is filtered out: stopwords and symbols are all kept. The visible
    effect is that punctuation ends up space-separated from the words.

    Args:
        text: The raw message string.

    Returns:
        The space-joined token string.
    """
    return ' '.join(nltk.word_tokenize(text))

# Store the tokenised form of every message next to the raw text
df["cleaned_df"] = df["Message"].map(preprocess_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# Compare the raw Message column with its tokenised counterpart on the first two rows.
df.head(2)

Unnamed: 0,Category,Message,spam,cleaned_df
0,ham,"Go until jurong point, crazy.. Available only ...",0,"Go until jurong point , crazy .. Available onl..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar ... Joking wif u oni ...


**SAMPLING OUR IMBALANCED DATASET**

In [10]:
# Model input: the tokenised message text
x_df = df.cleaned_df
x_df.head(1)

0    Go until jurong point , crazy .. Available onl...
Name: cleaned_df, dtype: object

In [11]:
# Model target: the 0/1 spam flag
y_df = df["spam"]
y_df.head(1)

0    0
Name: spam, dtype: int64

**Data Splitting**

In [12]:
# Hold out 20% of the messages for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.2, random_state=42
)

# Report the size of each piece, in the same order as before
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

(4457,)
(1115,)
(4457,)
(1115,)


**Feature Extraction using TF-IDF**

In [13]:
# Vocabulary and IDF weights are learned from the training split only;
# stop words are deliberately kept.
vectorizer = TfidfVectorizer(stop_words=None)
x_train_tfidf = vectorizer.fit_transform(x_train)

# The test split is only projected onto the already-fitted vocabulary.
x_test_tfidf = vectorizer.transform(x_test)

In [14]:
# Peek at the first few training messages (index values are the original row numbers).
x_train.head(5)

1978    Reply to win Â£100 weekly ! Where will the 2006...
3989    Hello . Sort of out in town already . That . S...
3935    How come guoyang go n tell her ? Then u told h...
4078    Hey sathya till now we dint meet not even a si...
4086    Orange brings you ringtones from all time Char...
Name: cleaned_df, dtype: object

**Balancing my Target Variable**

In [15]:
# Oversample the minority class in TF-IDF feature space so that both labels
# are equally represented in the resampled training set.
# (SMOTE is already imported in the notebook's first cell.)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(x_train_tfidf, y_train)

# NOTE(review): the training cell below fits on x_train_tfidf / y_train, not on
# X_train_resampled / y_train_resampled, so this balanced set is currently unused.

# Confirm the two classes now have the same number of rows
print(y_train_resampled.value_counts())


spam
1    3859
0    3859
Name: count, dtype: int64


**Training my model by stacking two algorithms (SVM and RandomForestClassifier)**

In [None]:
# StackingClassifier, LogisticRegression and RandomForestClassifier are already
# imported in the notebook's first cell, so they are not re-imported here.

# Both base learners weight the spam class (label 1) twice as heavily as ham.
spam_weights = {0: 1, 1: 2}
svm_base = SVC(kernel='rbf', class_weight=spam_weights, probability=True, random_state=42)
rf_base = RandomForestClassifier(class_weight=spam_weights, random_state=42)
estimators = [('svm', svm_base), ('rf', rf_base)]

# A logistic regression combines the base learners' 5-fold out-of-sample predictions.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
)

# NOTE(review): this fits on the original (imbalanced) TF-IDF split; the
# SMOTE-resampled X_train_resampled / y_train_resampled from the previous
# section are not used. Confirm whether class weights are meant to replace SMOTE.
stacking_model.fit(x_train_tfidf, y_train)

# Persist the fitted ensemble together with the vectorizer it depends on
joblib.dump(stacking_model, 'Rebranded_spam_email_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

**Model Evaluation**

In [None]:
# NOTE(review): this cell is repeated verbatim under the next heading; one of
# the two copies can be dropped.

# Score the held-out split with the fitted ensemble
y_pred_stacking = stacking_model.predict(x_test_tfidf)

# Summarise the errors two ways: raw counts and per-class metrics
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
class_report_stacking = classification_report(y_test, y_pred_stacking)

print("Confusion Matrix with Stacking Classifier:", conf_matrix_stacking, sep="\n")
print("\nClassification Report with Stacking Classifier:", class_report_stacking, sep="\n")

**Testing my model on the test data: classification report and confusion matrix**

In [17]:
# Score the held-out split with the fitted ensemble
y_pred_stacking = stacking_model.predict(x_test_tfidf)

# Summarise the errors two ways: raw counts and per-class metrics
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
class_report_stacking = classification_report(y_test, y_pred_stacking)

print("Confusion Matrix with Stacking Classifier:", conf_matrix_stacking, sep="\n")
print("\nClassification Report with Stacking Classifier:", class_report_stacking, sep="\n")


Confusion Matrix with Stacking Classifier:
[[964   2]
 [  7 142]]

Classification Report with Stacking Classifier:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       0.99      0.95      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



**Testing my trained model on a hand-written sample input (email)**

In [18]:
# Sample email to test
sample_email = "Congratulations! You've won a $1000 gift card. Click here to claim your prize."

# Run the same tokenise-and-rejoin step the training text went through before
# vectorizing. Skipping it feeds the model raw text it was never trained on
# (NLTK splits contractions and punctuation), which can yield different TF-IDF
# tokens at inference time than at training time.
sample_email_tfidf = vectorizer.transform([preprocess_text(sample_email)])

# Predict using the trained stacking classifier
sample_email_prediction = stacking_model.predict(sample_email_tfidf)

# Print the prediction result (label 1 is spam)
if sample_email_prediction[0] == 1:
    print("The email is classified as: Spam")
else:
    print("The email is classified as: Not Spam")


# NOTE(review): this is a joblib pickle, not an HDF5 file, so the ".h5"
# extension is misleading. The load cell below reads the same filename, so any
# rename has to be made in both places.
joblib.dump(stacking_model, 'final_spam email classifier.h5')



The email is classified as: Spam


['final_spam email classifier.h5']

In [19]:
# Round-trip check: reload the ensemble that was just written to disk.
# joblib.load unpickles the file, so only load model files from a trusted source.
loaded_model=joblib.load("final_spam email classifier.h5")

In [20]:
# The reloaded model should give the same label for the sample email as the in-memory one.
loaded_model.predict(sample_email_tfidf)

array([1])

In [28]:
# Record the library versions this notebook was run with
for label, module in (
    ("pandas==", pd),
    ("imbalanced-learn==", imblearn),
    ("scikit-learn==", sklearn),
    ("numpy==", np),
    ("joblib==", joblib),
    ("nltk==", nltk),
):
    print(label, module.__version__)

pandas== 2.0.3
imbalanced-learn== 0.10.1
scikit-learn== 1.2.2
numpy== 1.25.2
joblib== 1.4.2
nltk== 3.8.1
