# Importing dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Loading dataset

In [117]:
# Read the e-mail CSV into a DataFrame and preview its first five rows.
raw_mail = pd.read_csv(filepath_or_buffer='email_classification.csv')
raw_mail.head(n=5)

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [118]:
# Replace every missing cell (NaN/None) with an empty string. The result is a
# new frame, so raw_mail itself is left untouched.
mail = raw_mail.mask(raw_mail.isna(), '')

In [119]:
# Preview the null-filled frame to confirm it still looks like the raw data.
mail.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


# Label encoding

In [128]:
# Encode the text labels as integers in place: spam -> 0, ham -> 1.
# Both masks are computed before any write so neither assignment can
# influence the other.
spam_rows = mail['label'] == 'spam'
ham_rows = mail['label'] == 'ham'
mail.loc[spam_rows, 'label'] = 0
mail.loc[ham_rows, 'label'] = 1

# Fail fast on anything that was not encoded (for example the '' used to fill
# missing labels, or differently-cased text). Left alone, such rows only
# surface later as an opaque conversion error when the labels are cast to int.
unexpected_labels = mail.loc[~mail['label'].isin([0, 1]), 'label'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(f"Unexpected label values: {unexpected_labels.tolist()}")

In [130]:
# Separate the message text (model input) from the encoded label (target).
x, y = mail['email'], mail['label']

# Splitting data into train and test data

In [133]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=3,
)

In [134]:
# Display the held-out e-mails to see what the test split contains.
X_test

6      Your email account storage is full. Click here...
73     Thank you for attending our workshop. Here are...
41     We appreciate your business. Here's a token of...
47     You've been selected for a free iPhone X. Clic...
176    Your free trial period is ending soon. Upgrade...
67     Congratulations! You're our lucky winner of th...
155    We're thrilled to announce our partnership wit...
112    Stay in the loop with our newsletter. Subscrib...
146    We're extending our sale for one more day due ...
25     We've extended our sale for one more day! Don'...
87     We're excited to share our latest blog post wi...
4      Your opinion matters! Take our survey and help...
23     Your PayPal account has been suspended. Click ...
3      Your Amazon account has been locked. Click her...
72     Good morning! Attached is the report you reque...
126    We've launched a new feature based on user fee...
133    You're invited to a special event. Click here ...
86     Your annual membership h

# Feature extraction

In [155]:
# TF-IDF vectoriser: lower-cases the text, drops English stop words and keeps
# every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vocabulary and weights on the training text only, then reuse the
# fitted vectoriser for the test text so both share the same feature columns.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The encoded labels are cast to a real integer dtype before modelling.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [156]:
# Print the TF-IDF training matrix returned by fit_transform; the printed form
# lists each stored entry as a (row, column) pair followed by its weight.
print(X_train_features)

  (0, 358)	0.36160559511978
  (0, 121)	0.33041881908941517
  (0, 31)	0.4149197408845378
  (0, 17)	0.38373296485417285
  (0, 120)	0.36160559511978
  (0, 207)	0.30829144935502234
  (0, 9)	0.4149197408845378
  (0, 371)	0.20662720618330413
  (1, 51)	0.36707961679613343
  (1, 213)	0.33948866686946094
  (1, 366)	0.33948866686946094
  (1, 118)	0.33948866686946094
  (1, 257)	0.36707961679613343
  (1, 375)	0.36707961679613343
  (1, 141)	0.36707961679613343
  (1, 113)	0.33948866686946094
  (2, 295)	0.44086670354344487
  (2, 55)	0.18874050894509203
  (2, 135)	0.45957772685179105
  (2, 2)	0.5446612366607229
  (2, 116)	0.5120019728974065
  (3, 150)	0.5
  (3, 176)	0.5
  (3, 180)	0.5
  (3, 313)	0.5
  :	:
  (157, 111)	0.27675474662057914
  (157, 120)	0.3526322633232568
  (158, 208)	0.4444615241117209
  (158, 351)	0.41105428738435545
  (158, 177)	0.3412433970401375
  (158, 319)	0.35394423898607325
  (158, 109)	0.3412433970401375
  (158, 303)	0.38735147571343864
  (158, 360)	0.35394423898607325
  (159, 

In [110]:
# Display the training e-mails.
# NOTE(review): this cell's execution count is lower than that of the cells
# around it, which suggests it was run out of order; re-run the notebook top
# to bottom so the displayed output reflects the current X_train.
X_train

Unnamed: 0,email
6,Your email account storage is full. Click here...
73,Thank you for attending our workshop. Here are...
41,We appreciate your business. Here's a token of...
47,You've been selected for a free iPhone X. Clic...
176,Your free trial period is ending soon. Upgrade...
67,Congratulations! You're our lucky winner of th...
155,We're thrilled to announce our partnership wit...
112,Stay in the loop with our newsletter. Subscrib...
146,We're extending our sale for one more day due ...
25,We've extended our sale for one more day! Don'...


# Model training

In [159]:
# Create the classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [160]:
# Train the classifier on the TF-IDF training features and integer labels.
model.fit(X=X_train_features, y=y_train)

# Model evaluation (accuracy on training and test data)

In [161]:
# Score the model on the same data it was fitted on.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [162]:
# Report the fraction of training e-mails the model classifies correctly.
print(accuracy_on_training_data)

0.9875776397515528


In [163]:
# Score the model on the held-out test features it did not see during fitting.
prediction_on_testing_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [164]:
# Report the fraction of held-out e-mails classified correctly.
# NOTE(review): the split uses test_size=0.1, so this figure rests on a small
# number of e-mails; treat it as a rough estimate.
print(accuracy_on_testing_data)

1.0


# Building a predictive system

In [165]:
# Classify a single unseen message with the fitted vectoriser and model.
input_mail = ['Your account has been credited with loyalty points. Redeem them for exciting rewards!']
input_mail_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_mail_features)

# A predicted class of 1 is reported as ham; any other value as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail
