In [54]:
import pandas as pd


In [5]:
df=pd.read_csv("SMSSpamCollection",names=['label','sms'],delimiter='\t')

In [6]:
df.head()


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.drop_duplicates(subset='sms', inplace=True)

In [8]:
df.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,Wat time r ü going to xin's hostel?
freq,4516,1


In [9]:
import pandas as pd
import numpy as np
import chardet 
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from datetime import date
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

## splitting into test and train data

In [11]:
X = df['sms']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (5169,)
Shape of X_train is (3876,) and shape of y_train is (3876,)
Shape of X_test is (1293,) and shape of y_test is (1293,)


# Multilayer perceptron

MLP is a kind of generalization of logistic regression, where between the input and output layer, we have additional layers of neurons, called hidden layers. Each layer transforms from previous layer by using a nonlinear activation(We will use relu) on the weighted linear sum. It then trains itself by backpropagation. In the following implementation, we shall use a 3 layer hidden network with 20 neurons in each layer.

In [33]:
vectorizer=TfidfVectorizer(max_features=3900)
vectorizer.fit(X_train)
X_train_MLP=vectorizer.transform(list(X_train))
vectorizer.fit(X_test)
X_test_MLP=vectorizer.transform(list(X_test))
model=MLPClassifier(hidden_layer_sizes=(20,20,20),random_state=42)
model.fit(X_train_MLP,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

### Cross validation scores

In [34]:
cv_scores = cross_val_score(model, X=X_train_MLP, y=y_train, cv=5)
print(cv_scores)

[0.98969072 0.99097938 0.98322581 0.98580645 0.98191214]


### Classification Report and Confusion Matrix

In [35]:
pred=model.predict(X_test_MLP)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.87      0.97      0.92      1124
       spam       0.19      0.05      0.08       169

avg / total       0.78      0.85      0.81      1293



Unnamed: 0,pred ham,pred spam
true ham,1089,161
true spam,35,8


# Multinomial Naive Baysian Classifier

Naive bayes is the algorithm which trains data based on Bayes theorem. It works by inverting conditional probabilities so that the query can be exprsessed as a function of measurable quantities.Multinomial naive Bayes assumes to have feature vector where each element represents the number of times it appears (or, very often, its frequency). Since we use tfidf, a measure of relative frequency, we choose to use MultinomialNB instead of BernoulliNB. The alpha parameter(Laplace smoothing parameter) is set to 50.0.

In [37]:
from sklearn.naive_bayes import MultinomialNB


In [55]:
pipeline_NB = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', MultinomialNB(alpha=50))])
pipeline_NB.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...       vocabulary=None)), ('classifier', MultinomialNB(alpha=50, class_prior=None, fit_prior=True))])

### Cross Validation Scores

In [56]:
print("The cross validation scores are: {}".format(cross_val_score(pipeline_NB, X=X_train, y=y_train, cv=5)))

The cross validation scores are: [0.875      0.875      0.87483871 0.87483871 0.87596899]


### Classfication scores and confusion matrix

In [43]:
pred=pipeline_NB.predict(X_test)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98      1124
       spam       1.00      0.69      0.81       169

avg / total       0.96      0.96      0.96      1293



Unnamed: 0,pred ham,pred spam
true ham,1124,53
true spam,0,116


# Support Vector Machine

Support vector machine is a dicriminative classifier which when given labelled training data, produces an optimal hyperplane which categorizes the data. In the following implementation,we use guassian kernel,(also called rbf, radial basis function). Kernel is a function that transforms data space so that optimal hyperplane can be found even for data which has non linear separator. We also tune the regularization parameter C to 500 so that optimizer will choose smaller margin hyperplane and will not misclassify much points.

In [66]:
from sklearn.svm import SVC
pipeline_SVM = Pipeline([('vectorizer', TfidfVectorizer()),
                     ('classifier', SVC(C=500))])
pipeline_SVM.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

### Cross Validation Scores

In [67]:
print("The cross validation scores are: {}".format(cross_val_score(pipeline_SVM, X=X_train, y=y_train, cv=5)))

The cross validation scores are: [0.94845361 0.95489691 0.9483871  0.95483871 0.95090439]


### Classification Scores and Confusion Matrix

In [68]:
pred=pipeline_SVM.predict(X_test)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1124
       spam       1.00      0.69      0.82       169

avg / total       0.96      0.96      0.96      1293



Unnamed: 0,pred ham,pred spam
true ham,1124,52
true spam,0,117


## Summary:
MLP gives very good cross validation scores but overfits the data, as seen from comparatively poor accuracy. MultinomialNB and SVM prove to be very good classifiers which can be seen from the confusion matrix that they classify all spam mails correctly.