1. Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  #convert text --> feature vectors(numeric values)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

2. Load dataset

In [2]:
# Read the labelled messages from the CSV file sitting next to the notebook.
raw_mail_data = pd.read_csv(filepath_or_buffer="mail_data.csv")

In [3]:
# Preview the first ten rows of the freshly loaded frame.
raw_mail_data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


3. Analysis

In [4]:
# Summarise column names, non-null counts and dtypes to check for missing values.
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Build a cleaned copy in which every missing cell becomes an empty string;
# raw_mail_data itself is left untouched.
mail_data = raw_mail_data.mask(raw_mail_data.isna(), '')

In [6]:
# Check the number of rows and columns in the cleaned dataframe.
mail_data.shape

(5572, 2)

4. Label encoding of the categorical column

In [7]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to integers later, after the split.
for category_name, category_code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == category_name, 'Category'] = category_code

spam --> 0

ham  --> 1

In [8]:
# Preview the first ten rows again to confirm the labels are now 0 / 1.
mail_data.head(10)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...
7,1,As per your request 'Melle Melle (Oru Minnamin...
8,0,WINNER!! As a valued network customer you have...
9,0,Had your mobile 11 months or more? U R entitle...


5. Defining dependent and independent variables

In [9]:
# Features (x): the raw message text.
# Target (y): the encoded category (0 = spam, 1 = ham).
x, y = mail_data['Message'], mail_data['Category']

In [10]:
# Show the message column that will be vectorised.
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [11]:
# Show the encoded labels; note the dtype is still object at this point.
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


6. Split x and y into train and test datasets

In [12]:
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split, and every score computed from it,
# is identical after a Restart & Run All instead of changing on each execution.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Sanity check: feature and label partitions must have matching lengths.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4457,), (1115,), (4457,), (1115,))

7. Feature extraction

In [13]:
# Transform the text data into TF-IDF feature vectors that can be used as input
# to the Logistic Regression model.

feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

# TF-IDF gives each word in a message a weight that grows with how often the word
# occurs in that message and shrinks with how many messages contain it, so
# distinctive words (e.g. "free", "offer", "discount") stand out from common ones.
# min_df = 1             --> ignore words that appear in fewer than 1 message,
#                            i.e. keep every word seen in the training data
# stop_words = 'english' --> drop words from scikit-learn's built-in English stop-word list
# lowercase = True       --> convert all text to lowercase before tokenising; this must be
#                            the boolean True: the string 'True' is only truthy by accident
#                            and is rejected by scikit-learn versions that validate
#                            constructor parameters

In [14]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectoriser on the test messages so that nothing about the
# test set influences the features.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [15]:
# The label Series still carry object dtype after the encoding step; cast both
# partitions to integers so the targets are numeric 0 (spam) / 1 (ham).
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [16]:
# Print the sparse training matrix: each output line is a (row, column) position
# followed by the TF-IDF weight stored there.
print(x_train_features)

  (0, 256)	0.27634309717490263
  (0, 3324)	0.1755046373513671
  (0, 319)	0.17826547581136218
  (0, 6878)	0.1678999073263407
  (0, 2975)	0.1962026716125243
  (0, 4519)	0.27634309717490263
  (0, 4067)	0.2634778408902618
  (0, 7035)	0.27634309717490263
  (0, 4611)	0.15221606984307692
  (0, 2047)	0.22861928372089735
  (0, 1829)	0.21949123912081445
  (0, 3104)	0.27634309717490263
  (0, 7034)	0.27634309717490263
  (0, 3678)	0.5086995925803578
  (0, 2556)	0.19260679479915604
  (1, 3183)	0.20396619905369798
  (1, 4132)	0.20422014817562778
  (1, 1730)	0.35068386712005173
  (1, 5744)	0.43017882654303696
  (1, 7452)	0.43017882654303696
  (1, 822)	0.43017882654303696
  (1, 6801)	0.3117395963378894
  (1, 6667)	0.21342965957090995
  (1, 7320)	0.30954399063322185
  (2, 7283)	0.2625020642205925
  :	:
  (4454, 4446)	0.2249192232075804
  (4454, 6282)	0.17588463829118
  (4454, 4481)	0.18501527196494646
  (4454, 7365)	0.1854485563091247
  (4454, 6852)	0.16587020272123215
  (4455, 1540)	0.39030900564982723

8. Train model

Logistic Regression

In [17]:
# Create the classifier with the library's default settings.
model = LogisticRegression()

In [18]:
# Fit the Logistic Regression model on the TF-IDF training features and their labels.
model.fit(x_train_features, y_train)

Evaluating the trained model

In [19]:
# Prediction on the training data.
# The model is scored on the same messages it was fitted on, so this number shows
# how well it fits the training set rather than how well it generalises.

prediction_on_training_data = model.predict(X=x_train_features)

accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [20]:
# Report the fraction of training messages classified correctly.
print('accuracy on training data: ',accuracy_on_training_data)

accuracy on training data:  0.9690374691496523


In [21]:
# Prediction on the test data.
# These messages were held out of training, so this accuracy is the estimate of
# how the model performs on unseen mail.

prediction_on_test_data = model.predict(X=x_test_features)

accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [22]:
# Report the fraction of held-out test messages classified correctly.
# The label says "test data" so this line cannot be mistaken for the training score.
print('accuracy on test data: ',accuracy_on_test_data)

accuracy on training data:  0.9605381165919282


Building A Predictive System

In [23]:
# Classify one hand-written message with the fitted vectoriser and model.
# The vectoriser expects an iterable of documents, hence the one-element list.
input_mail = ["""Dear, Rs.1500 Welcome Bonus credited to My11circle account. IND vs PAK T20 Match. Prize Pool - Rs.2,51,00,000 (2.51CR) & Thar Click - http://1kx.in/Vioh0I """]

# Reuse the vocabulary learned from the training set (transform only, no re-fitting).
input_data_features = feature_extraction.transform(input_mail)

# predict() returns one label per input document.
prediction = model.predict(input_data_features)

# Translate the numeric label back into words: 1 = ham, 0 = spam.
predicted_label = prediction[0]
if predicted_label == 1:
    print("It is ham mail")
elif predicted_label == 0:
    print("It is spam mail")
print(prediction)

It is ham mail
[1]
