In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
# Read the raw mail dataset from the CSV file in the working directory
df = pd.read_csv("mail_data.csv")

# Keep every non-missing cell as-is and substitute an empty string for each missing one
df1 = df.where(df.notna(), other='')

In [33]:
# Preview the first rows of the null-filled frame
df1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
# Dimensions of the frame as (rows, columns)
df1.shape

(5572, 2)

In [35]:
# Encode the labels numerically: spam -> 1, ham -> 0.
# Both masks are taken up front; the spam assignment cannot create or remove
# any 'ham' rows, so the result matches computing them one after the other.
spam_rows = df1['Category'] == 'spam'
ham_rows = df1['Category'] == 'ham'

df1.loc[spam_rows, 'Category'] = 1
df1.loc[ham_rows, 'Category'] = 0

In [36]:
# Preview the frame again to check that Category now holds the 0/1 codes
df1.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
# Model input is the message text; the target is the encoded category
x, y = df1['Message'], df1['Category']

In [38]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

In [39]:
# Sanity-check the split sizes: full message set, training portion, test portion
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4179,) (1393,)


In [40]:
# Inspect the training labels (index, value and dtype of the Series)
print(y_train)

710     1
3740    0
2711    1
3155    1
3748    0
       ..
905     0
5192    0
3980    0
235     1
5157    0
Name: Category, Length: 4179, dtype: object


In [41]:
# Turn the raw message text into TF-IDF feature vectors for the Logistic Regression model.
# `lowercase` must be the boolean True, not the string 'True': the string only worked
# because it is truthy, and scikit-learn releases that validate estimator parameters
# reject it when the vectorizer is fitted.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# project the test messages onto that same fitted vocabulary.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The Category column holds the 0/1 codes with dtype object; cast the labels
# to integers before handing them to the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [42]:
 # Print the TF-IDF training matrix; each entry is shown as (row, column) followed by its weight.
 # NOTE(review): this cell is indented by one space. IPython appears to tolerate that,
 # but a plain .py export would likely fail with an IndentationError - consider dedenting.
 print(x_train_features)

  (0, 1968)	0.20555770187976505
  (0, 4621)	0.2826608319517557
  (0, 6670)	0.23567651227529574
  (0, 50)	0.2826608319517557
  (0, 4245)	0.2826608319517557
  (0, 262)	0.25802160000805496
  (0, 4815)	0.16823978217870592
  (0, 2777)	0.2987551880042128
  (0, 4819)	0.2467015577941807
  (0, 1524)	0.2134897628788885
  (0, 3760)	0.21612407138649548
  (0, 5307)	0.25008953900893155
  (0, 3849)	0.20358236606353303
  (0, 4623)	0.22048220359216783
  (0, 5024)	0.22371596808895342
  (0, 3086)	0.2065881188886313
  (0, 507)	0.2826608319517557
  (1, 6542)	0.657492938833411
  (1, 6025)	0.657492938833411
  (1, 2166)	0.3679756388246497
  (2, 6611)	0.21801015986499822
  (2, 6470)	0.35233710750013614
  (2, 5086)	0.33581174761157134
  (2, 98)	0.35233710750013614
  (2, 3177)	0.22281059031897985
  :	:
  (4176, 387)	0.317680062733604
  (4176, 3876)	0.25677970808202527
  (4176, 7131)	0.23431439791927364
  (4176, 6188)	0.25152520362673875
  (4176, 4414)	0.28918871571362903
  (4177, 6371)	0.31506538554722807
  (417

In [43]:
# Create the classifier; no arguments are passed, so library defaults apply
lr = LogisticRegression()

In [44]:
# Train the classifier on the TF-IDF training features and their labels
lr.fit(x_train_features, y_train)

LogisticRegression()

In [45]:
# Predict on the same data the model was fitted on and measure the fraction classified correctly
prediction_training = lr.predict(x_train_features)
accuracy_training = accuracy_score(y_true=y_train, y_pred=prediction_training)

In [46]:
# Report the accuracy measured on the training data
print("Accuracy of the model on training data = ",accuracy_training)

Accuracy of the model on training data =  0.9662598707824839


In [47]:
# Predict on the held-out test features and measure the fraction classified correctly
prediction_test = lr.predict(x_test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=prediction_test)

In [48]:
# Report the accuracy measured on the held-out test data
print("Accuracy of the model on test data = ",accuracy_test)

Accuracy of the model on test data =  0.968413496051687


In [52]:
# One hand-picked message to classify, wrapped in a list so it is passed as a
# collection containing a single document
mail_check = [
    " 07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow "
]

# Vectorise with the already-fitted vectorizer (transform only, no refit), then classify
mail_check_features = feature_extraction.transform(mail_check)
predict_mail = lr.predict(mail_check_features)

In [53]:
# A predicted label of 1 is reported as spam; any other label is reported as ham
print("Spam mail (True)" if predict_mail[0] == 1 else "Ham Mail (False)")

Spam mail (True)
