Importing the Dependencies

In [60]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [61]:
# Read the mail corpus from the CSV file in the working directory into a DataFrame
raw_mail_data = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [62]:
# plain-text dump of the raw frame; the repr elides the middle rows and wraps wide column sets
print(raw_mail_data)

     Category                                            Message Unnamed: 2  \
0         ham  Go until jurong point, crazy.. Available only ...        NaN   
1         ham                      Ok lar... Joking wif u oni...        NaN   
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3         ham  U dun say so early hor... U c already then say...        NaN   
4         ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...       ...                                                ...        ...   
8626      ham  url supplieda method could allow scientist pro...        NaN   
8627      ham  url blair broker whitehal deal trust borrow pr...        NaN   
8628      ham  url suppliedth new softwar switch dummi code c...        NaN   
8629      ham    url govern admit million may work beyond number        NaN   
8630      ham  url could jam intercept satellit thrown cours ...        NaN   

     Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Un

In [63]:
# Swap every missing value for an empty string, leaving the raw frame untouched
not_null_mask = raw_mail_data.notnull()
mail_data = raw_mail_data.where(not_null_mask, other='')

In [64]:
# Preview the first five rows of the cleaned dataframe
mail_data.head(n=5)

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,ham,"Go until jurong point, crazy.. Available only ...",,,,,,,,,,,,,,,
1,ham,Ok lar... Joking wif u oni...,,,,,,,,,,,,,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,,,,,,,,,,,,
3,ham,U dun say so early hor... U c already then say...,,,,,,,,,,,,,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,,,,,,,,,,,,


In [65]:
# (row count, column count) of the dataframe after null replacement
mail_data.shape

(8631, 17)

Label Encoding

In [66]:
# Encode the target column in place: spam -> 0, ham -> 1.
# Rows whose Category is neither of those values are left untouched.
for category_name, category_code in (('spam', 0), ('ham', 1)):
    is_category = mail_data['Category'] == category_name
    mail_data.loc[is_category, 'Category'] = category_code

spam  -  0

ham  -  1

In [67]:
# Split the frame into model input (message text) and target (encoded category)
X, Y = mail_data['Message'], mail_data['Category']

In [68]:
# inspect the message texts that serve as model input
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
8626    url supplieda method could allow scientist pro...
8627    url blair broker whitehal deal trust borrow pr...
8628    url suppliedth new softwar switch dummi code c...
8629      url govern admit million may work beyond number
8630    url could jam intercept satellit thrown cours ...
Name: Message, Length: 8631, dtype: object


In [69]:
# inspect the encoded labels (0 = spam, 1 = ham)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
8626    1
8627    1
8628    1
8629    1
8630    1
Name: Category, Length: 8631, dtype: object


Splitting the data into training data & test data

In [70]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [71]:
# Sizes of the full set, the training split and the test split, one per line
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(8631,)
(6904,)
(1727,)


Feature Extraction

In [72]:
# Turn the raw message text into TF-IDF feature vectors that can be fed to
# the Logistic Regression model.
# `lowercase` is a boolean parameter: scikit-learn 1.2 deprecated passing an
# int for boolean parameters and later releases reject it, so pass True
# rather than 1.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them for the test split so the test data does not influence the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The label column is object dtype after the encoding step; cast it to
# integers before handing it to the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')




In [73]:
# raw training messages, in the shuffled order produced by the split
print(X_train)

3186    MAYBE IF YOU WOKE UP BEFORE FUCKING 3 THIS WOU...
8425    url onlin observer' traci mcveigh expert tim b...
8459    url record giant' move highlight chang situat ...
2472                sry can't talk on phone, with parents
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
7161    number pm number number justin mason wrotejust...
2707    S now only i took tablets . Reaction morning o...
6400    press releas ayn rand alton parkway suit ca nu...
1688    Free Top ringtone -sub to weekly ringtone-get ...
5994    tri research via sa mirror search engin exist ...
Name: Message, Length: 6904, dtype: object


In [74]:
# TF-IDF training matrix; each entry prints as (row, column index) followed by its weight
print(X_train_features)

  (0, 25313)	0.26327603035790054
  (0, 36534)	0.506390622726645
  (0, 12500)	0.4771159261678671
  (0, 36337)	0.506390622726645
  (0, 19364)	0.4360999876920493
  (1, 25838)	0.1625251039958751
  (1, 24898)	0.15736207935444846
  (1, 26350)	0.1848048942916818
  (1, 13339)	0.26825160202465986
  (1, 8934)	0.1866624023575617
  (1, 22115)	0.19625464609863782
  (1, 32533)	0.20408669708318788
  (1, 14665)	0.30681147707561296
  (1, 2268)	0.30681147707561296
  (1, 5008)	0.321042779662922
  (1, 32593)	0.20949718722103688
  (1, 10865)	0.20166513623648682
  (1, 19439)	0.321042779662922
  (1, 33322)	0.321042779662922
  (1, 22363)	0.321042779662922
  (1, 22892)	0.1663726150271857
  (1, 34335)	0.12215850781129976
  (2, 11103)	0.49041684198067687
  (2, 19204)	0.2655940208521174
  (2, 6562)	0.34371014022642665
  :	:
  (6903, 25097)	0.1253794070969458
  (6903, 16301)	0.16117275978043222
  (6903, 10793)	0.1271323483579491
  (6903, 22742)	0.1345764394463162
  (6903, 23222)	0.1345764394463162
  (6903, 28481)	

Training the Model

Logistic Regression

In [75]:
# Logistic Regression classifier constructed with no arguments (library defaults)
model = LogisticRegression()

In [76]:
# fit the Logistic Regression model on the TF-IDF training features and their integer labels
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [77]:
# Score the fitted model on the same data it was trained on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [78]:
# fraction of training messages whose predicted label matches the true label
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9649478563151797


In [79]:
# Score the fitted model on the held-out test split
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [80]:
# fraction of held-out test messages whose predicted label matches the true label
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9554140127388535


Building a Predictive System

In [81]:
# A single raw message to classify, wrapped in a list because the vectorizer
# transforms a collection of documents
input_mail = ["07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow"]

# Project the message into the TF-IDF feature space learned from the training data
input_data_features = feature_extraction.transform(input_mail)

# Predict the label and show the raw prediction array
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 was assigned to ham during encoding; anything else is reported as spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail


In [85]:
# Persist the trained classifier and the fitted TF-IDF vectorizer so both can
# be reloaded together at prediction time. This cell needs `pickle` to be
# imported; without the import it raises NameError on a fresh kernel.
# NOTE: only unpickle these files from a trusted source, because loading a
# pickle can execute arbitrary code.
artifacts = {
    'model.pkl': model,
    'feature_extraction.pkl': feature_extraction,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)
    
    

