In [1]:
import pandas as pd   #create dataframes(data normalization, data filling, data visualization)
import numpy as np    #working with array , linear algebra fourier transform
from sklearn.model_selection import train_test_split #splitting the data into test and train data 
from sklearn.feature_extraction.text import TfidfVectorizer  #converting text data into numerical values
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB  #needed by the naive bayes cell below
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix


In [2]:
# Read the CSV file of messages into a pandas DataFrame.
raw_mail_data = pd.read_csv(
    'mail_data.csv',
)
# Last expression of the cell: displays the type of the loaded object.
type(raw_mail_data)

pandas.core.frame.DataFrame

In [3]:
# Print the plain-text form of the loaded DataFrame to inspect its Category and Message columns.
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [4]:
# Keep every value that is not null and substitute an empty string for the rest.
# The result is a new frame; raw_mail_data itself is not modified.
mail_data = raw_mail_data.where(raw_mail_data.notnull(), other='')
# Show the first rows of the cleaned frame.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Shape of the cleaned frame as (rows, columns).
mail_data.shape

(5572, 2)

In [6]:
# Shape of the raw frame as (rows, columns), for comparison with mail_data.shape.
raw_mail_data.shape

(5572, 2)

In [7]:
# Label encoding, written into the Category column in place: spam -> 0, ham -> 1.
mail_data.loc[mail_data['Category'].eq('spam'), 'Category'] = 0
mail_data.loc[mail_data['Category'].eq('ham'), 'Category'] = 1

In [8]:
# Separate the data into the message texts (X) and their labels (Y).
X, Y = mail_data['Message'], mail_data['Category']

In [9]:
# Print the message texts (Series X).
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [10]:
# Print the encoded labels (Series Y): 0 = spam, 1 = ham.
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [11]:
# Hold out 30% of the messages for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)

In [12]:
# Print the messages that ended up in the training split.
print(X_train)

1455    Can ü all decide faster cos my sis going home ...
3460    Not heard from U4 a while. Call me now am here...
2493    No drama Pls.i have had enough from you and fa...
3378    Yup. Wun believe wat? U really neva c e msg i ...
3826    Hi. I'm always online on yahoo and would like ...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 3900, dtype: object


In [13]:
# Print the messages that ended up in the test split.
print(X_test)

2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
454     Ok i will tell her to stay out. Yeah its been ...
983     Congrats! 2 mobile 3G Videophones R yours. cal...
1282        Am I the only one who doesn't stalk profiles?
4610                               Y de asking like this.
                              ...                        
5017        Hey gals...U all wanna meet 4 dinner at nìte?
4540    Party's at my place at usf, no charge (but if ...
105          Umma my life and vava umma love you lot dear
881     Reminder: You have not downloaded the content ...
3995    I love to cuddle! I want to hold you in my str...
Name: Message, Length: 1672, dtype: object


In [14]:
# Number of messages in the test split.
print(X_test.shape)

(1672,)


In [15]:
# Number of messages in the training split.
print(X_train.shape)

(3900,)


In [16]:
# Print the full, unsplit message Series X again for reference.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [17]:
# Total number of messages before splitting.
print(X.shape)

(5572,)


Feature Extraction
(convert the text data into TF-IDF feature vectors that can be used as input to the classifiers below: logistic regression, KNN, naive Bayes and SVM)

In [18]:
# Build the TF-IDF vectorizer that turns message text into numeric features.
#   min_df=1             -> keep every term that appears in at least one training message
#   stop_words='english' -> drop common English words (is, the, did, are, was, ...)
#   lowercase=True       -> lower-case the text before tokenizing; this must be a real
#                           boolean, because the string 'True' is rejected by the
#                           parameter validation in recent scikit-learn releases
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

# Learn the vocabulary and IDF weights from the TRAINING messages only, then reuse
# that fitted vocabulary to transform the test messages.
X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)

# The label Series have dtype object after the label encoding; cast them to integers
# before they are handed to the classifiers and metric functions.
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [19]:
# Print the TF-IDF matrix of the training messages; entries are listed as (row, column) followed by the value.
print(X_train_features)

  (0, 3651)	0.3587221851838313
  (0, 3086)	0.2739938546908377
  (0, 2825)	0.27449571589502497
  (0, 5509)	0.40335109063478575
  (0, 1799)	0.3212783541487352
  (0, 2489)	0.5022955611445961
  (0, 1965)	0.45020567466160955
  (1, 4242)	0.25454751074340237
  (1, 4277)	0.33272056276910283
  (1, 3821)	0.2433861866561607
  (1, 6813)	0.2600994552195357
  (1, 6)	0.33272056276910283
  (1, 6147)	0.16521313463046028
  (1, 2058)	0.18864352312402063
  (1, 3671)	0.16295245094503386
  (1, 1153)	0.31698956000242107
  (1, 3875)	0.19423188001633226
  (1, 3534)	0.33272056276910283
  (1, 3451)	0.1449316171395763
  (1, 4271)	0.19008955732757765
  (1, 6331)	0.33272056276910283
  (1, 3003)	0.2841165834640824
  (2, 5099)	0.30299688346739717
  (2, 6545)	0.19459812997829923
  (2, 4225)	0.30299688346739717
  :	:
  (3897, 2908)	0.32603177409174644
  (3897, 4271)	0.2690692327260679
  (3898, 2081)	0.3654387356575668
  (3898, 3632)	0.3140198944936123
  (3898, 6590)	0.3055286729595858
  (3898, 3587)	0.3117604480135819


In [20]:
# Print the TF-IDF matrix of the test messages; entries are listed as (row, column) followed by the value.
print(X_test_features)

  (0, 6740)	0.19347233837676625
  (0, 6407)	0.2055080097399481
  (0, 4979)	0.23875109810071085
  (0, 4829)	0.20062729909686805
  (0, 4069)	0.18430828922923403
  (0, 1432)	0.2621756463911434
  (0, 1300)	0.3157499757997181
  (0, 1261)	0.25355210549045815
  (0, 1007)	0.24243442763882023
  (0, 969)	0.28300554867610606
  (0, 375)	0.23538848168685603
  (0, 284)	0.24441669340216773
  (0, 19)	0.30463229794808017
  (0, 13)	0.2692215923431076
  (0, 8)	0.28300554867610606
  (0, 1)	0.23875109810071085
  (1, 6836)	0.3036306856524676
  (1, 6235)	0.42846692136789877
  (1, 6101)	0.32740817341021416
  (1, 6022)	0.2644544987998762
  (1, 5758)	0.3624150379626633
  (1, 4381)	0.2289817544122339
  (1, 4101)	0.34902822855126225
  (1, 3229)	0.4914205959782368
  (2, 6678)	0.22376179533403717
  :	:
  (1668, 4654)	0.32065517896927925
  (1668, 4533)	0.4122308441557586
  (1668, 2851)	0.24779663110632108
  (1668, 1538)	0.4065909471787599
  (1669, 6457)	0.4196531319230013
  (1669, 6343)	0.7496300970222255
  (1669, 3

In [21]:
# Print the training labels after the integer cast (0 = spam, 1 = ham).
print(Y_train)


1455    1
3460    0
2493    1
3378    1
3826    1
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 3900, dtype: int32


In [22]:
# Print the test labels after the integer cast (0 = spam, 1 = ham).
print(Y_test)

2632    0
454     1
983     0
1282    1
4610    1
       ..
5017    1
4540    1
105     1
881     0
3995    1
Name: Category, Length: 1672, dtype: int32


In [23]:
# Logistic regression: train on the TF-IDF training features.
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Predict on both splits first, then score each of them.
prediction_on_training_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

# precision_score / recall_score are called with their default positive label
# (1 per the scikit-learn docs), so these numbers describe the ham class (label 1).
precision_on_training_data = precision_score(Y_train, prediction_on_training_data)
recall_on_training_data = recall_score(Y_train, prediction_on_training_data)
precision_on_test_data = precision_score(Y_test, prediction_on_test_data)
recall_on_test_data = recall_score(Y_test, prediction_on_test_data)

print("precision_on_training_data: ", precision_on_training_data)
print("recall_on_training_data: ", recall_on_training_data)
print("precision_on_test_data:     ", precision_on_test_data)
print("recall_on_test_data: ", recall_on_test_data)

# Displayed as the cell output; label order is fixed to [0 (spam), 1 (ham)].
confusion_matrix(Y_test, prediction_on_test_data, labels=[0, 1])

precision_on_training_data:  0.9629945915172218
recall_on_training_data:  0.9994091580502216
precision_on_test_data:      0.9612558450233801
recall_on_test_data:  0.9993055555555556


array([[ 174,   58],
       [   1, 1439]], dtype=int64)

In [24]:
# Classify one hand-written message with the currently trained model.
input_mail = ["hey prashant!win win you won 500$"]
# Reuse the fitted vectorizer so the sample is mapped onto the training vocabulary.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 0 was assigned to spam during label encoding; anything else is reported as ham.
print("Spam mail" if prediction[0] == 0 else "Ham mail")

Spam mail


In [25]:
# K-nearest-neighbours classifier with scikit-learn's default settings.
model = KNeighborsClassifier()
model.fit(X_train_features, Y_train)

# Predictions for both splits.
prediction_on_training_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

# Metrics use the default positive label (1 per the scikit-learn docs), i.e. ham.
precision_on_training_data = precision_score(Y_train, prediction_on_training_data)
recall_on_training_data = recall_score(Y_train, prediction_on_training_data)
precision_on_test_data = precision_score(Y_test, prediction_on_test_data)
recall_on_test_data = recall_score(Y_test, prediction_on_test_data)

print("precision_on_training_data: ", precision_on_training_data)
print("recall_on_training_data: ", recall_on_training_data)
print("precision_on_test_data:     ", precision_on_test_data)
print("recall_on_test_data: ", recall_on_test_data)

# Displayed as the cell output; label order is fixed to [0 (spam), 1 (ham)].
confusion_matrix(Y_test, prediction_on_test_data, labels=[0, 1])

precision_on_training_data:  0.9128675478823847
recall_on_training_data:  0.9997045790251108
precision_on_test_data:      0.897196261682243
recall_on_test_data:  1.0


array([[  67,  165],
       [   0, 1440]], dtype=int64)

In [35]:
#using naive bayes
# GaussianNB is imported from sklearn.naive_bayes in the import cell at the top.
# It needs dense input, so convert each sparse TF-IDF matrix exactly once and
# reuse the dense copy for both fitting and predicting.
X_train_dense=X_train_features.toarray()
X_test_dense=X_test_features.toarray()

model=GaussianNB()
model.fit(X_train_dense,Y_train)

prediction_on_training_data=model.predict(X_train_dense)
# Metrics use the default positive label (1 per the scikit-learn docs), i.e. ham.
precision_on_training_data=precision_score(Y_train,prediction_on_training_data)
recall_on_training_data=recall_score(Y_train,prediction_on_training_data)
print("precision_on_training_data: " ,precision_on_training_data)
print("recall_on_training_data: " ,recall_on_training_data)

prediction_on_test_data=model.predict(X_test_dense)
precision_on_test_data=precision_score(Y_test,prediction_on_test_data)
recall_on_test_data=recall_score(Y_test,prediction_on_test_data)
print("precision_on_test_data:     " ,precision_on_test_data)
print("recall_on_test_data: " ,recall_on_test_data)

# Displayed as the cell output; label order is fixed to [0 (spam), 1 (ham)].
confusion_matrix(Y_test,prediction_on_test_data,labels=[0,1])

precision_on_training_data:  1.0
recall_on_training_data:  0.9285081240768095
precision_on_test_data:      0.9793577981651376
recall_on_test_data:  0.8895833333333333


array([[ 205,   27],
       [ 159, 1281]], dtype=int64)

In [26]:
# Classify one hand-written message with the currently trained model.
input_mail=["hey prashant!win win you won 500$"]
input_data_features=feature_extraction.transform(input_mail)

# transform() returns a sparse matrix. The model trained in the cell directly above
# (GaussianNB) only accepts dense input, so convert before predicting; a dense array
# is also accepted by the other classifiers used in this notebook.
prediction=model.predict(input_data_features.toarray())
# Label 0 was assigned to spam during label encoding; anything else is reported as ham.
if prediction[0]==0:
    print("Spam mail")
else:
    print("Ham mail")

Ham mail


In [27]:
# Support vector classifier with scikit-learn's default settings.
model = SVC()
model.fit(X_train_features, Y_train)

# Predictions for both splits.
prediction_on_training_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

# Metrics use the default positive label (1 per the scikit-learn docs), i.e. ham.
precision_on_training_data = precision_score(Y_train, prediction_on_training_data)
recall_on_training_data = recall_score(Y_train, prediction_on_training_data)
precision_on_test_data = precision_score(Y_test, prediction_on_test_data)
recall_on_test_data = recall_score(Y_test, prediction_on_test_data)

print("precision_on_training_data: ", precision_on_training_data)
print("recall_on_training_data: ", recall_on_training_data)
print("precision_on_test_data:     ", precision_on_test_data)
print("recall_on_test_data: ", recall_on_test_data)

# Displayed as the cell output; label order is fixed to [0 (spam), 1 (ham)].
confusion_matrix(Y_test, prediction_on_test_data, labels=[0, 1])

precision_on_training_data:  0.9979363207547169
recall_on_training_data:  1.0
precision_on_test_data:      0.9749492213947191
recall_on_test_data:  1.0


array([[ 195,   37],
       [   0, 1440]], dtype=int64)

In [28]:
# Classify one hand-written message with the currently trained model.
input_mail = ["hey prashant!win win you won 500$"]
# Reuse the fitted vectorizer so the sample is mapped onto the training vocabulary.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 0 was assigned to spam during label encoding; anything else is reported as ham.
print("Spam mail" if prediction[0] == 0 else "Ham mail")


Spam mail
