# Importing all the libraries

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.model_selection import GridSearchCV

# Loading data

In [11]:
data=pd.read_csv("spam.csv",encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Cleaning data

In [12]:
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"class", "v2":"text"})
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
data['length']=data['text'].apply(len)
data.head()

Unnamed: 0,class,text,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


# Data pre-processing

In [14]:
def pre_process(text):
    #lowercase,stemming,stop words removal
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
        ps=PorterStemmer()
        words+=(ps.stem(i))+" "
        
    return words

In [15]:
textFeatures = data['text'].copy()
textFeatures = textFeatures.apply(pre_process)
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)

features_train, features_test, labels_train, labels_test = train_test_split(features, data['class'], test_size=0.3, random_state=111)


# Model fitting and prediction

In [16]:
svc=SVC()
svc.fit(features_train,labels_train)
parameters={'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
grid=GridSearchCV(SVC(),parameters,verbose=3)
grid.fit(features_train,labels_train)
print(grid.best_params_)
print(grid.best_estimator)
grid_predictions = grid.predict(labels_test)
SVM_predictions = svc.predict(labels_test)
print(confusion_matrix(labels_test,SVM_predictions))
print(confusion_matrix(labels_test,grid_predictions))


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.8677940046118371, total=   0.8s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] ......... C=0.1, gamma=1, score=0.8676923076923077, total=   0.8s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.8s remaining:    0.0s


[CV] ......... C=0.1, gamma=1, score=0.8683602771362586, total=   0.9s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.8677940046118371, total=   0.5s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.8676923076923077, total=   0.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.8683602771362586, total=   0.5s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.8677940046118371, total=   0.3s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.8676923076923077, total=   0.4s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.8683602771362586, total=   0.3s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.2min finished


{'C': 100, 'gamma': 0.01}


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator'

# Evaluation

In [17]:

print("Detailed classification report:")
print()
print('The model is trained on full development set')
print('The scores are evaluated on full evaluation set')
print()
y_true,y_pred=labels_test,svc.predict(features_test)
print(classification_report(y_true,y_pred))
print()

Detailed classification report:

The model is trained on full development set
The scores are evaluated on full evaluation set

              precision    recall  f1-score   support

         ham       0.86      1.00      0.93      1440
        spam       0.00      0.00      0.00       232

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.74      0.86      0.80      1672




  'precision', 'predicted', average, warn_for)


# Testing aaccuracy of other models

In [28]:
#other classifiers
classifiers=[MultinomialNB(),
                GaussianNB(),
                BernoulliNB(),
                LinearSVC()]

for clf in classifiers:
    clf_name=clf.__class__.__name__
    clf.fit(features_train.toarray(),labels_train)
    y_pred=clf.predict(features_test.toarray())
    print(accuracy_score(labels_test,y_pred))

0.9599282296650717
0.8863636363636364
0.972488038277512
0.9820574162679426
