In [None]:
#importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Data Preprocessing

In [None]:
# Read the raw spam/ham CSV into a pandas DataFrame and preview it.
# The location is kept in one named constant so it is easy to spot and change.
DATASET_PATH = '/content/spamham.csv'
raw_mail_data = pd.read_csv(DATASET_PATH)
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Keep every non-null cell as it is and substitute an empty string for each
# null cell; the raw frame itself is left untouched.
not_null_mask = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_null_mask, other='')

In [None]:
# Remove fully duplicated rows; rebinding the name (instead of inplace=True)
# leaves mail_data holding the same de-duplicated frame.
mail_data = mail_data.drop_duplicates()

In [None]:
# Show the (rows, columns) dimensions of mail_data.
mail_data.shape

(5157, 2)

In [None]:
# Preview the first rows of mail_data.
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Label spam mail as 0 and non-spam mail (ham) as 1.
label_codes = {'ham': 1, 'spam': 0}

# Translate the text labels. Values that are not keys of label_codes are passed
# through unchanged, so a column that already holds 0/1 survives a re-run of
# this cell (a plain .map(dict) would turn every already-encoded label into NaN).
encoded_category = mail_data['Category'].map(
    lambda label: label_codes.get(label, label)
)

# Fail early with a clear message if anything other than a known label or an
# existing 0/1 code is present (for example an empty string left by null
# filling), instead of silently carrying bad labels into training.
unexpected_labels = encoded_category[~encoded_category.isin([0, 1])].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        'Unexpected values in Category column: {}'.format(list(unexpected_labels))
    )

# Store the codes with an explicit integer dtype.
mail_data['Category'] = encoded_category.astype('int64')

In [None]:
# Preview the first rows again to check the Category column after label encoding.
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Separate the data into text and label: X --> message text, Y --> label
X, Y = mail_data['Message'], mail_data['Category']

In [None]:
# Print the messages and the labels, separated by a line of asterisks.
print(X, '******************', Y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5157, dtype: object
******************
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5157, dtype: int64


In [None]:
# Split the data into train and test sets: 67% of the rows for training and
# 33% for testing; random_state is fixed so the same split is produced each run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.67,
    test_size=0.33,
    random_state=2,
)

In [None]:
# Display the training messages returned by the split.
X_train

3070                    Ay wana meet on sat?ü wkg on sat?
4053                              Ya ok, then had dinner?
3330      Sac will score big hundred.he is set batsman:-)
5036    How many times i told in the stage all use to ...
2919                        Thanks chikku..:-) gud nyt:-*
                              ...                        
3540        What happen dear. Why you silent. I am tensed
1127    Not tonight mate. Catching up on some sleep. T...
2633                       I WILL CAL YOU SIR. In meeting
3839    Went to pay rent. So i had to go to the bank t...
2699                             FROM 88066 LOST £12 HELP
Name: Message, Length: 3455, dtype: object

In [None]:
# Display the training labels returned by the split.
Y_train

3070    1
4053    1
3330    1
5036    1
2919    1
       ..
3540    1
1127    1
2633    1
3839    1
2699    0
Name: Category, Length: 3455, dtype: int64

In [None]:
# Display the held-out test messages returned by the split.
X_test

2454                         How much she payed. Suganya.
2226    Alrite jod hows the revision goin? Keris bin d...
5420                I dont know oh. Hopefully this month.
1016    Dear,regret i cudnt pick call.drove down frm c...
261                                                   Yup
                              ...                        
212                            Home so we can always chat
1505                           Oh my God. I'm almost home
3547    SO IS TH GOWER MATE WHICH IS WHERE I AM!?! HOW...
4472              Wa... U so efficient... Gee... Thanx...
728       Alright omw, gotta change my order to a half8th
Name: Message, Length: 1702, dtype: object

In [None]:
# Display the held-out test labels returned by the split.
Y_test

2454    1
2226    1
5420    1
1016    1
261     1
       ..
212     1
1505    1
3547    1
4472    1
728     1
Name: Category, Length: 1702, dtype: int64

# Importing pipeline and doing feature extraction

In [None]:
from sklearn.pipeline import Pipeline
import joblib

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Step 1: convert raw message text into TF-IDF features. Terms whose document
# frequency is above 85% are ignored (max_df=0.85) and each row is L2-normalised.
tfidf_step = ('vectorizer', TfidfVectorizer(max_df=0.85, norm='l2'))

# Step 2: support vector classifier with scikit-learn's default settings.
svc_step = ('SVC', SVC())

# Chain both steps so raw text can be passed straight to fit/predict.
pipeline = Pipeline(steps=[tfidf_step, svc_step])

In [None]:
# Fit the vectorizer and the classifier on the training messages and labels.
pipeline.fit(X=X_train, y=Y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.85, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('SVC',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr

In [None]:
# Persist the fitted pipeline to disk so it can be reloaded later.
filename = 'pipeline.sav'
joblib.dump(value=pipeline, filename=filename)

['pipeline.sav']

In [None]:
# Reload the persisted pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load files that come from a trusted source.
model = joblib.load('pipeline.sav')

# Evaluation of the model

In [None]:
# Predict on the training messages and compare against the training labels.
prediction_on_training_data = model.predict(X_train)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print('The accuracy on training data is: ', accuracy_on_training_data)

The accuracy on training data is:  0.9971056439942113


In [None]:
# Predict on the held-out test messages and compare against the test labels.
prediction_on_test_data = model.predict(X_test)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print('The accuracy on test data is: ', accuracy_on_test_data)

The accuracy on test data is:  0.9759106933019976


# Prediction on new mail

In [None]:
input_mail = '''England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+'''

# Making prediction: the single mail is wrapped in a list before it is passed
# to the pipeline's predict method.
prediction = model.predict([input_mail])
print(prediction)

# Label 1 was assigned to ham and 0 to spam during preprocessing.
print('HAM MAIL' if prediction[0] == 1 else 'SPAM MAIL')

[0]
SPAM MAIL
