In [1]:
## Importing the necessary libraries
import pandas as pd
import numpy as np

In [2]:
## Loading the mail dataset from a CSV file in the working directory and previewing the first rows
data=pd.read_csv('mail_data.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
## Getting a brief overview of the data (column names, non-null counts and dtypes)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
## Counting the rows that are exact duplicates of an earlier row
data.duplicated().sum()

415

In [6]:
## Dropping the duplicated rows (the first occurrence of each is kept and the original index labels are retained)
data=data.drop_duplicates()
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
## Checking whether the dataset is imbalanced by counting the messages in each category
data['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4516
spam,641


From the counts above we can see that this is not a balanced dataset: ham messages (4516) far outnumber spam messages (641).

In [8]:
## Checking the shape of the data (rows, columns) after removing duplicates
data.shape

(5157, 2)

In [9]:
## Getting the basic descriptive statistics (count, unique, top, freq) of each column
data.describe()

Unnamed: 0,Category,Message
count,5157,5157
unique,2,5157
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [15]:
## Encoding the category as integers: spam -> 0, ham -> 1
## - The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run on labels that are already encoded.
## - `assign` builds a new frame instead of writing into the frame returned by `drop_duplicates`.
## - The final cast gives the column a real integer dtype and raises if any label is outside the mapping,
##   instead of silently leaving an unexpected string in the column.
category_codes={'spam':0,'ham':1,0:0,1:1}
data=data.assign(Category=data['Category'].map(category_codes).astype('int64'))
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
## Checking for missing values in each column
data.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [17]:
## Confirming the shape of the data (rows, columns) after encoding
data.shape

(5157, 2)

In [18]:
## Separating the message text (feature, X) from the encoded category (target, y)
X,y=data['Message'],data['Category']

In [19]:
## Previewing the first few messages
X.head()

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
## Inspecting the encoded labels (0 = spam, 1 = ham)
y

Unnamed: 0,Category
0,1
1,1
2,0
3,1
4,1
...,...
5567,0
5568,1
5569,1
5570,1


In [22]:
## Splitting the data into 80% training and 20% test samples with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split
## NOTE(review): the classes are imbalanced; consider passing stratify=y so both splits keep the same spam/ham ratio
split_parts=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)
X_train,X_test,y_train,y_test=split_parts

In [23]:
## Number of training messages
X_train.shape

(4125,)

In [24]:
## Number of test messages
X_test.shape

(1032,)

In [26]:
## Performing feature extraction on the mail text with scikit-learn's TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_options=dict(min_df=1,stop_words='english',lowercase=True,binary=True)
feature_extraction=TfidfVectorizer(**vectorizer_options)
## Learn the vocabulary and IDF weights from the training messages only ...
X_train=feature_extraction.fit_transform(X_train)
## ... then apply that same fitted vocabulary to the test messages
X_test=feature_extraction.transform(X_test)
## NOTE: X_train / X_test now hold sparse feature matrices instead of raw text,
## so re-run the train/test split cell before re-running this one

In [28]:
## Printing the training features: each line is (row, vocabulary index) followed by the TF-IDF weight
print(X_train)

  (0, 2400)	0.4689535663823655
  (0, 1247)	0.5538832733861689
  (0, 6605)	0.4898673616987752
  (0, 6692)	0.48303813512243965
  (1, 6492)	0.5755914257195885
  (1, 5859)	0.5964494866231046
  (1, 1592)	0.5594126567616489
  (2, 5786)	0.13538651756257114
  (2, 4038)	0.2515686205085572
  (2, 6925)	0.12194034445200479
  (2, 4089)	0.22565786708590121
  (2, 4522)	0.24277117206461765
  (2, 3695)	0.22157441644801484
  (2, 3252)	0.18672999852452019
  (2, 6828)	0.1422674550896824
  (2, 4084)	0.13584638268274823
  (2, 4519)	0.2515686205085572
  (2, 4520)	0.2515686205085572
  (2, 4393)	0.2515686205085572
  (2, 798)	0.2515686205085572
  (2, 2564)	0.20037766083141206
  (2, 25)	0.2515686205085572
  (2, 682)	0.2359473409111717
  (2, 7355)	0.22157441644801484
  (2, 5064)	0.2359473409111717
  :	:
  (4120, 2101)	0.3080717396234338
  (4120, 3921)	0.3922048767024034
  (4120, 3181)	0.42851037414347337
  (4120, 6456)	0.47518913626571924
  (4120, 1223)	0.5846719438819551
  (4121, 4064)	0.2824317115715428
  (4121

In [29]:
## Printing the test features: each line is (row, vocabulary index) followed by the TF-IDF weight
print(X_test)

  (0, 1518)	0.6739853501828288
  (0, 3886)	0.41026591111704
  (0, 4018)	0.33853958897969616
  (0, 4738)	0.32748577740962176
  (0, 6086)	0.3944231765529361
  (1, 874)	0.6325276576749704
  (1, 3080)	0.3405927969937548
  (1, 4277)	0.6049436351033556
  (1, 6643)	0.34343632198724217
  (2, 5662)	0.5865608773464819
  (2, 7259)	0.8099051408446086
  (3, 854)	1.0
  (4, 632)	0.2864977412765236
  (4, 987)	0.22221423049122346
  (4, 1743)	0.17630463702159754
  (4, 2010)	0.20870975045358797
  (4, 2112)	0.17439586309391203
  (4, 2629)	0.3006186339823807
  (4, 3329)	0.2764788084577557
  (4, 3393)	0.23044680218521785
  (4, 4720)	0.21556046313795804
  (4, 4927)	0.25698940150283794
  (4, 6255)	0.16472694735776627
  (4, 6556)	0.2140782647026487
  (4, 6668)	0.2864977412765236
  :	:
  (1027, 2508)	0.2411261781044666
  (1027, 2979)	0.3158063487602124
  (1027, 3684)	0.35331081549331816
  (1027, 4018)	0.16913023151968562
  (1027, 4453)	0.20496382340684327
  (1027, 4786)	0.33671482458181523
  (1027, 5753)	0.2799

In [31]:
## Making sure both label vectors have an integer dtype before fitting (a no-op if they already do)
y_train,y_test=y_train.astype('int'),y_test.astype('int')

In [35]:
## Initializing a logistic regression model with scikit-learn's default settings
## and fitting it on the TF-IDF training features
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,y_train)

In [36]:
## Predicting labels for the training data (to measure how well the model fits the data it was trained on)
predictin_train_data=model.predict(X_train)

In [37]:
## Checking the accuracy of the model on the training data
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
accuracy_score(y_train,predictin_train_data)

0.9587878787878787

In [40]:
## Computing the confusion matrix for the training data (labels: 0 = spam, 1 = ham)
confusion_matrix(y_train,predictin_train_data)

array([[ 337,  165],
       [   5, 3618]])

In [41]:
## Classification report (precision, recall, f1-score per class) for the training data
print(classification_report(y_train,predictin_train_data))

              precision    recall  f1-score   support

           0       0.99      0.67      0.80       502
           1       0.96      1.00      0.98      3623

    accuracy                           0.96      4125
   macro avg       0.97      0.83      0.89      4125
weighted avg       0.96      0.96      0.96      4125



In [42]:
## Predicting the test data and checking the accuracy on messages the model has not seen
prediction_test_data=model.predict(X_test)
accuracy_score(y_test,prediction_test_data)

0.9563953488372093

In [43]:
## Computing the confusion matrix for the test data (labels: 0 = spam, 1 = ham)
confusion_matrix(y_test,prediction_test_data)

array([[ 94,  45],
       [  0, 893]])

In [44]:
## Printing the classification report (precision, recall, f1-score per class) for the test data
print(classification_report(y_test,prediction_test_data))

              precision    recall  f1-score   support

           0       1.00      0.68      0.81       139
           1       0.95      1.00      0.98       893

    accuracy                           0.96      1032
   macro avg       0.98      0.84      0.89      1032
weighted avg       0.96      0.96      0.95      1032



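After comparing the accuracy of the model on the training data (about 0.959) and the test data (about 0.956), I concluded that the model generalizes well, with no sign of overfitting or underfitting. Note, however, that recall for the spam class (label 0) is only about 0.68 on both sets, so roughly a third of spam messages are still classified as ham; overall accuracy is dominated by the much larger ham class.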

In [45]:
## Testing the model on a hand-written message
input_mail=["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]
## Reuse the already-fitted vectorizer so the message is mapped onto the training vocabulary
input_data_feature=feature_extraction.transform(input_mail)
predict=model.predict(input_data_feature)
## `predict` holds one label per message in `input_mail` (0 = spam, 1 = ham).
## Reporting each label separately avoids relying on the truth value of an array
## comparison, which raises as soon as more than one message is supplied.
for label in predict:
    if label==0:
        print("The mail is a spam")
    else:
        print("The mail is a ham")

The mail is a spam


In [46]:
## Persisting the fitted model and the fitted vectorizer; both are needed to classify new messages later
## NOTE: only unpickle these files from a trusted location, since loading a pickle can execute arbitrary code
import pickle
## `with` closes each file handle (and flushes it to disk) even if dumping fails
with open('logistic_regression.pkl', "wb") as model_file:
    pickle.dump(model,model_file)
with open('feature_extraction.pkl', "wb") as vectorizer_file:
    pickle.dump(feature_extraction,vectorizer_file)