# Task 6 

### Email Spam Filtering

#### Objective: Use a dataset of emails and build a model that can identify spam emails.

Author : Aurobindo Parida

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the spam/ham CSV. The original absolute path is kept as the default so the
# notebook still runs unchanged on the author's machine; set the SPAM_DATA_PATH environment
# variable to point at the file anywhere else.
DATA_PATH = os.environ.get("SPAM_DATA_PATH", r'C:\Users\askap\Downloads\spam_ham_dataset.csv')
df = pd.read_csv(DATA_PATH)
# read_csv already returns a DataFrame, so it is displayed directly (no pd.DataFrame wrapper).
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


Let's drop the columns we do not need (`Unnamed: 0` and `label_num`).

In [3]:
# Drop the columns that are not used for modelling. Rebinding `df` (instead of inplace=True)
# together with errors="ignore" makes this cell safe to re-run: on a second run the columns
# are already gone and the drop is a no-op rather than raising KeyError.
df = df.drop(columns=["Unnamed: 0", "label_num"], errors="ignore")
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [4]:
# Summarise the remaining columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5171 non-null   object
 1   text    5171 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB


In [5]:
# Encode the target in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int later, before fitting.
is_spam = df['label'] == 'spam'
is_ham = df['label'] == 'ham'
df.loc[is_spam, 'label'] = 0
df.loc[is_ham, 'label'] = 1

In [6]:
# Features are the raw email text; the target is the encoded label column.
x, y = df['text'], df['label']

In [7]:
# Show the feature series (raw email text).
print(x)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [8]:
# Show the target series after encoding (0 = spam, 1 = ham).
print(y)

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object


In [9]:
# Hold out 20% of the emails for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [10]:
# The TF-IDF vocabulary is fitted on the training emails only and then reused, unchanged,
# to transform the test emails.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The label series is object-typed after the manual encoding; cast both splits to int.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [11]:
# Show the training portion of the email text.
print(x_train)

4933    Subject: fw : hot teen peeing\r\nwell well wel...
999     Subject: vicodin for you . cheap .\r\nyou need...
2390    Subject: heisse sx action\r\nhallo mein lieber...
722     Subject: so much fun with these girls . .\r\nh...
4217    Subject: central power & light\r\ndaren - was ...
                              ...                        
3046    Subject: 1 / 2000 meter 9643 three rivers issu...
1725    Subject: southern union for may\r\n2100 83 st\...
4079    Subject: sweet virgins love hard sex : )\r\nfi...
2254    Subject: fw : cell phones\r\nhere ' s the late...
2915    Subject: enron / hpl actuals for august 15 , 2...
Name: text, Length: 4136, dtype: object


In [12]:
# Show the TF-IDF features computed for the training emails.
print(x_train_features)

  (0, 14036)	0.0980778267912726
  (0, 11808)	0.06987796592054588
  (0, 12391)	0.0980778267912726
  (0, 25414)	0.09060870451789685
  (0, 41956)	0.052683318944850066
  (0, 20003)	0.10972131570143474
  (0, 24816)	0.032317827274495566
  (0, 41416)	0.07574037596188578
  (0, 17854)	0.07044282332499875
  (0, 43423)	0.06737832901503668
  (0, 18697)	0.08805874200498454
  (0, 31229)	0.10091335636144186
  (0, 7740)	0.08593595246394614
  (0, 20209)	0.1961556535825452
  (0, 21031)	0.06715341696201405
  (0, 20291)	0.08499424769502247
  (0, 38502)	0.08593595246394614
  (0, 44210)	0.06039751107994651
  (0, 19937)	0.28140662110504605
  (0, 22637)	0.10972131570143474
  (0, 4390)	0.10091335636144186
  (0, 16716)	0.09060870451789685
  (0, 32980)	0.08499424769502247
  (0, 23021)	0.10972131570143474
  (0, 41383)	0.06491114985197706
  :	:
  (4134, 24816)	0.04229407249175294
  (4134, 17854)	0.023046984648179333
  (4134, 15340)	0.029674429132047632
  (4134, 41635)	0.030443350512632018
  (4134, 39885)	0.0212371

In [13]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [14]:
# Fit the classifier on the TF-IDF training features and the integer-cast labels.
model.fit(x_train_features,y_train)

In [15]:
# Predict labels for the held-out emails and score them against the true labels.
prediction = model.predict(x_test_features)
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)

In [16]:
# Report the accuracy measured on the held-out test emails.
print("Accuracy on test set: ",accuracy)

Accuracy on test set:  0.991304347826087


In [19]:
# Read one email from the user and echo it back.
mail = input("Enter your mail: ")
print(mail)

# Vectorise with the already-fitted TF-IDF vocabulary (transform only, no refit),
# then classify with the trained model.
mail_features = feature_extraction.transform([mail])
prediction = model.predict(mail_features)
print(prediction)

# Label encoding used in this notebook: 0 -> spam, anything else (1) -> ham.
verdict = "Spam mail" if prediction[0] == 0 else "Ham Mail"
print(verdict)

Enter your mail: Hi @githubisjericho,  We noticed your personal access token (classic) "Watson to Github" with gist and repo scopes will expire in about 10 hours.  If this token is still needed, visit https://github.com/settings/tokens/1440247810/regenerate to generate an equivalent.  If you run into problems, please contact support by visiting https://github.com/contact?tags=dotcom-accounts  Thanks, The GitHub Team
Hi @githubisjericho,  We noticed your personal access token (classic) "Watson to Github" with gist and repo scopes will expire in about 10 hours.  If this token is still needed, visit https://github.com/settings/tokens/1440247810/regenerate to generate an equivalent.  If you run into problems, please contact support by visiting https://github.com/contact?tags=dotcom-accounts  Thanks, The GitHub Team
[1]
Ham Mail


In [20]:
# Same interactive check as the previous cell, run on a second email typed by the user.
mail = input("Enter your mail: ")
print(mail)

# Reuse the fitted vectoriser and model; nothing is refitted here.
mail_features = feature_extraction.transform([mail])
prediction = model.predict(mail_features)
print(prediction)

# Label encoding used in this notebook: 0 -> spam, anything else (1) -> ham.
verdict = "Spam mail" if prediction[0] == 0 else "Ham Mail"
print(verdict)

Enter your mail: 1st February  ‘The greatest literary show on Earth!’  Day 1 is rich with poetry, fiction, geopolitics, economic forecasts, sporting anecdotes and fantasy, tempered with dystopia, peppered with climate action, and spiritual quests. From the Booker to the Academy, today’s speakers are winners of diverse accolades!  Check these out!
1st February  ‘The greatest literary show on Earth!’  Day 1 is rich with poetry, fiction, geopolitics, economic forecasts, sporting anecdotes and fantasy, tempered with dystopia, peppered with climate action, and spiritual quests. From the Booker to the Academy, today’s speakers are winners of diverse accolades!  Check these out!
[0]
Spam mail
