Including the Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

Data Preprocessing

In [4]:
# Read the raw mail dataset from CSV into a pandas DataFrame
# NOTE(review): '/content/...' looks like a Colab-specific absolute path — adjust when running elsewhere
unprocessed_mail_data = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [5]:
# Keep every non-null cell as is and substitute an empty string for each missing one;
# the raw frame is left untouched and the result is bound to a new name
mail_data = unprocessed_mail_data.where(unprocessed_mail_data.notna(), '')

In [6]:
# Preview the first five rows to check that the columns loaded as expected
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Dimensions of the DataFrame as a (rows, columns) tuple
mail_data.shape

(5572, 2)

Label Encoding i.e. ham = 1, spam = 0

In [9]:
# Overwrite the text labels in the Category column with integer codes: spam -> 0, ham -> 1
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

In [10]:
# Preview the first five rows again; Category should now hold 0/1 instead of 'spam'/'ham'
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


Separating Data into features and labels

In [11]:
# Features are the raw message text; labels are the encoded Category column
X, Y = mail_data['Message'], mail_data['Category']

In [12]:
# Inspect the message text that will serve as the model input
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [13]:
# Inspect the label Series taken from the Category column
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Doing Train Test Split

In [14]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [15]:
# Shape of the training split of the message Series
print(X_train.shape)

(4457,)


Extracting the Features (Converting the Message text)

In [17]:
# Turn the raw message text into TF-IDF feature vectors that the Logistic Regression model can take as input.
# TF-IDF weights a term by how often it occurs in a message, scaled down by how many messages contain it,
# so a word that is frequent in one message but rare across the corpus gets a high score.
# `lowercase` must be the boolean True: the string 'True' only worked by being truthy and is
# rejected by the parameter validation in newer scikit-learn releases when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then vectorize them
X_train_features = feature_extraction.fit_transform(X_train)

# Vectorize the test messages with the vocabulary fitted on the training split (no refitting)
X_test_features = feature_extraction.transform(X_test)

# The labels come from an object-dtype column, so cast them to integers for the classifier
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [18]:
# Print the training feature matrix returned by the vectorizer's fit_transform
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

Training Our Logistic Regression Model

In [19]:
# Instantiate the classifier with its default hyperparameters (no arguments passed)
model = LogisticRegression()

In [20]:
# Fit the model on the training feature matrix and the matching training labels
model.fit(X_train_features, Y_train)

LogisticRegression()

Evaluating the Model (Accuracy Score)

In [25]:
# Score the fitted model on the same messages it was trained on
prediction_on_training_data = model.predict(X_train_features)

# Fraction of training labels that the model's predictions match
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

print("The Accuracy Of the Predictions On Training Data is", accuracy_on_training_data)

The Accuracy Of the Predictions On Training Data is 0.9670181736594121


In [26]:
# Score the fitted model on the held-out test messages
prediction_on_testing_data = model.predict(X_test_features)

# Fraction of test labels that the model's predictions match
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_testing_data,
)

print("The Accuracy Of the Predictions On Testing Data is", accuracy_on_testing_data)

The Accuracy Of the Predictions On Testing Data is 0.9659192825112107


Building a Predictive System with the Model

In [31]:
# A single message to classify, passed to the vectorizer as a one-element list
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]

# Vectorize the message with the already-fitted TF-IDF vectorizer, then classify it
input_data_feature = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_feature)

# A predicted label of 1 means ham (not spam); any other label is reported as spam
print("It's Not a Spam Mail" if prediction[0] == 1 else "It's a Spam Mail")

It's Not a Spam Mail
