In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the dataset from disk, then count the missing values in each column.
df = pd.read_csv('mail_spam_classifier.csv')
df.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [None]:
# Preview the first six rows of the loaded frame.
df.head(6)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...


In [None]:
# Summarise the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [None]:
# Encode the label column as integers: 'spam' -> 0, anything else -> 1.
# The column is overwritten in place, so the encoding is applied only while it
# still holds the raw string labels. Without this guard, re-running the cell
# would compare integers against 'spam' and silently turn every label into 1.
if not pd.api.types.is_numeric_dtype(df['Category']):
    df['Category'] = np.where(df['Category'] == 'spam', 0, 1)

In [None]:
# Inputs are the Message column; the target is the Category column.
x, y = df['Message'], df['Category']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Cast the labels to an integer dtype before they are passed to the estimators.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Fit the TF-IDF vocabulary on the training messages only, then apply that
# same fitted vectorizer to the test messages.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [None]:
# Candidate classifiers, keyed by the name printed in the evaluation reports.
# The decision tree and random forest are randomised estimators, so they are
# seeded (same value as the train/test split) to keep their scores -- and the
# best-model choice derived from them -- repeatable across runs.
models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "SVC": SVC(),
}

In [None]:
# Fit every candidate, print its train/test accuracy together with the test
# confusion matrix and classification report, and record the test accuracy
# per model name in `report`.
report = {}

for modelname, model in models.items():
    print(modelname)
    model.fit(x_train_features, y_train)

    y_train_pred = model.predict(x_train_features)
    y_test_pred = model.predict(x_test_features)

    train_accuracy_score = accuracy_score(y_train, y_train_pred)
    test_accuracy_score = accuracy_score(y_test, y_test_pred)
    report[modelname] = test_accuracy_score

    print("train_accuracy_score", train_accuracy_score)
    print("test_accuracy_score", test_accuracy_score)
    print(confusion_matrix(y_test, y_test_pred))
    print(classification_report(y_test, y_test_pred))


LogisticRegression
train_accuracy_score 0.9670181736594121
test_accuracy_score 0.967713004484305
[[113  36]
 [  0 966]]
              precision    recall  f1-score   support

           0       1.00      0.76      0.86       149
           1       0.96      1.00      0.98       966

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

DecisionTreeClassifier
train_accuracy_score 1.0
test_accuracy_score 0.967713004484305
[[125  24]
 [ 12 954]]
              precision    recall  f1-score   support

           0       0.91      0.84      0.87       149
           1       0.98      0.99      0.98       966

    accuracy                           0.97      1115
   macro avg       0.94      0.91      0.93      1115
weighted avg       0.97      0.97      0.97      1115

RandomForestClassifier
train_accuracy_score 1.0
test_accuracy_score 0.979372197309417
[[126  23]
 [  0 966]]
        

In [None]:
# Choose the model with the highest recorded test accuracy; on a tie the
# earliest entry in `report` wins.
best_model_name = max(report, key=report.get)
best_score = report[best_model_name]
print(best_model_name)
print(best_score)

# Fit the chosen estimator on the training features and keep it as `model`.
model = models[best_model_name]
model.fit(x_train_features, y_train)

SVC
0.9847533632286996


In [None]:
# Read one message from the user, vectorize it with the fitted TF-IDF
# transformer, and classify it with the selected model.
message = [str(input("enter email: "))]
input_message_feature = feature_extraction.transform(message)
prediction = model.predict(input_message_feature)

# A predicted label of 1 is reported as not spam; any other label as spam.
if prediction[0] == 1:
    print("Not spam")
else:
    print("Spam")

enter email: hhh
Not spam
