In [145]:
#imported Libraries
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [147]:
# Load the mail dataset from the CSV file into a DataFrame
csv_path = 'mail_data.csv'
data = pd.read_csv(csv_path)

In [151]:
# Display the DataFrame as a table (Jupyter shows a truncated view:
# the first and last rows with an ellipsis in between)
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [153]:
# Replace every missing value (NaN/None) with an empty string so that later
# text processing never receives a non-string value.
# fillna('') is the dedicated pandas call for what was previously spelled as
# data.where(pd.notnull(data), '').
data = data.fillna('')

In [157]:
# Preview the first five rows of the DataFrame
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [159]:
# Preview the last five rows of the DataFrame
data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [161]:
# Summarize the DataFrame: column names, non-null counts and the dtype of each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [163]:
# Encode the text labels as numbers in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int later on.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code

In [25]:
# Check that the Category column now holds the numeric codes (spam -> 0, ham -> 1)
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [165]:
# Split the frame into model input (message text) and target (numeric label)
x, y = data['Message'], data['Category']

In [29]:
# Inspect the message text that will be used as model input
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [31]:
# Inspect the labels (0 = spam, 1 = ham); note the dtype is still object at this point
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [167]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=43,
)

In [109]:
# Number of messages in the training split
X_train.shape

(4457,)

In [111]:
# Number of messages in the test split
X_test.shape

(1115,)

In [113]:
# Number of labels in the training split (should match X_train)
Y_train.shape

(4457,)

In [115]:
# Number of labels in the test split (should match X_test)
Y_test.shape

(1115,)

In [171]:
# Convert the raw message text into TF-IDF feature vectors.
# The vocabulary is learned from the training messages only and then reused
# unchanged for the test messages.
feature = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
x_feature = feature.fit_transform(X_train)
# NOTE: despite its name, y_feature holds the vectorized test *messages*
# (model inputs), not labels.
y_feature = feature.transform(X_test)

In [173]:
# Cast the labels from object dtype to integer so the classifier receives numeric targets
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [121]:
# Print the TF-IDF matrix of the training messages; each line of output is
# one stored entry in the form (row, column) followed by its weight
print(x_feature)

  (0, 292)	0.28615405928484194
  (0, 7015)	0.274923106867414
  (0, 435)	0.31898811463647736
  (0, 1776)	0.21553798987092807
  (0, 4006)	0.2511158401981089
  (0, 3888)	0.28615405928484194
  (0, 176)	0.379374647594024
  (0, 3175)	0.24780591872347738
  (0, 5271)	0.22773295786084194
  (0, 352)	0.274923106867414
  (0, 1148)	0.2599845446851309
  (0, 4716)	0.2186882544442195
  (0, 4422)	0.2116634662152977
  (0, 6974)	0.2374530186167799
  (1, 2189)	0.28145964392471373
  (1, 3968)	0.306864450067349
  (1, 2499)	0.302745036024923
  (1, 4301)	0.23088783765279
  (1, 4850)	0.28393009538224423
  (1, 7448)	0.35890151984333446
  (1, 3684)	0.3368230571639765
  (1, 3353)	0.302745036024923
  (1, 5375)	0.3368230571639765
  (1, 4041)	0.17652961236699605
  (1, 6648)	0.2480415002427906
  :	:
  (4453, 229)	0.3190735976175009
  (4453, 1429)	0.30421900689684545
  (4453, 1066)	0.26397032561082495
  (4453, 8)	0.2731774630430118
  (4453, 5423)	0.22654118352719554
  (4453, 18)	0.2731774630430118
  (4453, 1388)	0.232

In [123]:
# Print the TF-IDF matrix of the test messages; each line of output is
# one stored entry in the form (row, column) followed by its weight
print(y_feature)

  (0, 6650)	0.5046697840588168
  (0, 4128)	0.2722450953061859
  (0, 4033)	0.38445736393481894
  (0, 3171)	0.2733274473891906
  (0, 2316)	0.4536098960742917
  (0, 2259)	0.4928627812517625
  (1, 4888)	0.4142965876997952
  (1, 4357)	0.5262004213931601
  (1, 3849)	0.4488306250619716
  (1, 3758)	0.24755366105890117
  (1, 2388)	0.4394371357358627
  (1, 2351)	0.3092486258418011
  (2, 6693)	0.22253274396566178
  (2, 6655)	0.1831481423290393
  (2, 6639)	0.364645818085972
  (2, 6471)	0.23689217358800238
  (2, 6115)	0.2766920712718253
  (2, 5010)	0.24178949370306393
  (2, 4747)	0.2766920712718253
  (2, 4641)	0.21976841903859667
  (2, 4106)	0.23264992194289075
  (2, 4032)	0.40707928932863835
  (2, 3293)	0.2766920712718253
  (2, 3176)	0.1750712012397122
  (2, 2340)	0.16154786448145356
  :	:
  (1110, 4774)	0.23677704578291991
  (1110, 4303)	0.33934539748587356
  (1110, 4086)	0.2754030935201283
  (1110, 4041)	0.24151003528872828
  (1110, 3995)	0.249571718549411
  (1110, 3372)	0.26548551234873113
  (1

In [175]:
# Create a logistic-regression classifier; no arguments are passed, so the
# library's default hyperparameters are used
model=LogisticRegression()

In [177]:
# Train the classifier on the TF-IDF vectors of the training messages and their labels
model.fit(X=x_feature, y=Y_train)

In [129]:
# Predict labels for the vectorized test messages and measure the fraction
# of them that match the true test labels
prediction = model.predict(y_feature)
acc = accuracy_score(y_true=Y_test, y_pred=prediction)
print(acc)

0.9614349775784753


The model's accuracy on the held-out test set is 96.14%.

In [182]:
# Manually check the classifier on a hand-written spam message
mail=["Dear Valued Customer,Congratulations! You have been selected to receive a $1,000 Gift Card as a token of our appreciation. This exclusive offer is available for the next 24 hours only!Click the link below to claim your reward:Claim My Gift Card NowDon’t miss out on this limited-time opportunity to shop for free at your favorite stores!Act Fast! This offer expires soon.Sincerely,The Rewards Team " ]
# Vectorize with the already-fitted TF-IDF vocabulary (transform, not fit_transform)
mail_feature=feature.transform(mail)
predict=model.predict(mail_feature)
# model.predict returns an array with one label per input message. Index the
# single prediction explicitly rather than testing the whole array for truth,
# which raises ValueError as soon as more than one message is passed.
# Labels: 0 = spam, 1 = ham.
if predict[0]==0:
    print("It is a Spam Mail")
else:
    print("It is a Ham Mail")

It is a Spam Mail


In [184]:
# Manually check the classifier on a hand-written legitimate (ham) message
mail=["Hi Team,Just a reminder about our Project Update meeting:Date: Dec 6th, 10:00 AM Location: Conf Room B / Zoom See you there! Best, Alex " ]
# Vectorize with the already-fitted TF-IDF vocabulary (transform, not fit_transform)
mail_feature=feature.transform(mail)
predict=model.predict(mail_feature)
# model.predict returns an array with one label per input message. Index the
# single prediction explicitly rather than testing the whole array for truth,
# which raises ValueError as soon as more than one message is passed.
# Labels: 0 = spam, 1 = ham.
if predict[0]==0:
    print("It is a Spam Mail")
else:
    print("It is a Ham Mail")

It is a Ham Mail
