In [107]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [108]:
# Load the labelled mail dataset from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [109]:
# Number of messages per category, to see how the classes are distributed.
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [110]:
# Per-column flag: True if the column holds at least one missing value.
df.isna().any()

Category    False
Message     False
dtype: bool

In [20]:
# Stemming the content of each mail
# Shared helpers: a Porter stemmer and the NLTK English stop-word list
# (held as a set so membership checks are cheap).
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def stemming(text):
    """Normalise one message into a space-separated string of word stems.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, tokens found in
    ``stop_words`` are dropped, and each remaining token is passed through
    ``stemmer.stem``.

    Parameters
    ----------
    text : str
        The raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string when no
        token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    stemmed_tokens = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return ' '.join(stemmed_tokens)

In [22]:
# Store a stemmed copy of every message in a new column and preview it.
# NOTE(review): the later cells in this notebook build features from the
# raw 'Message' column; this column is not read again in the visible code.
df['Preprocessed_Msg'] = df['Message'].map(stemming)
df['Preprocessed_Msg'].head(5)

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: Preprocessed_Msg, dtype: object

In [111]:
# Model input: the raw message text (not the stemmed 'Preprocessed_Msg' column).
X = df.loc[:, 'Message']

In [112]:
# Encoding labels
# Turn the string categories into integer class ids; the fitted encoder is
# kept so the ids can be mapped back to category names later.
encoder = LabelEncoder()
category_labels = df['Category']
y = encoder.fit_transform(category_labels)

In [113]:
# Train, test split
# Hold out 20% of the messages for evaluation. Stratifying on y keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [118]:
# Vectorizing the texts
# The vectorizer is fitted on the training messages only and then reused,
# unchanged, to transform the test messages.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [132]:
# Balancing training data
# SMOTE resampling is applied to the training features only; the test
# features are left as they are.
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(
    X_train_features,
    y_train,
)

In [133]:
# Model training
# Logistic regression with default settings, fitted on the resampled
# (balanced) training set.
model = LogisticRegression()
model.fit(X=X_train_bal, y=y_train_bal)

In [134]:
# Predicted class ids for the held-out test messages.
y_pred = model.predict(X=X_test_features)

In [135]:
# Evaluate the test-set predictions: each metric is computed and printed in
# turn (per-class report, overall accuracy, confusion matrix).
class_rep = classification_report(y_test, y_pred)
print(f'Classification Report: \n{class_rep}')

acc_scr = accuracy_score(y_test, y_pred)
print(f'Accuracy Score: {acc_scr}')

conf_mat = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{conf_mat}')

Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.94      0.90      0.92       149

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Accuracy Score: 0.97847533632287
Confusion Matrix: 
[[957   9]
 [ 15 134]]


In [138]:
# Building the Prediction System
# Classify a single raw message with the fitted vectorizer and model and
# report the verdict. The text is handed to the vectorizer as-is because the
# vectorizer was fitted on the raw 'Message' column.
input_data = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
input_X_test = vectorizer.transform([input_data])
input_y_pred = model.predict(input_X_test)
# Decode the numeric prediction back to its category name rather than
# comparing against a hard-coded class id, so the branch does not depend on
# which integer the encoder assigned to 'ham'.
input_label = encoder.inverse_transform(input_y_pred)[0]
if input_label == 'ham':
    print('You received a mail. Check it')
else:
    print('SPAM mail... Moved safely to spam folder.')

SPAM mail... Moved safely to spam folder.
