In [22]:
import pandas as pd


In [23]:
df=pd.read_csv("SMSSpamCollection",names=['label','sms'],delimiter='\t')

In [24]:
df.head()


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [25]:
df.drop_duplicates(subset='sms', inplace=True)

In [26]:
df.describe()

Unnamed: 0,label,sms
count,5169,5169
unique,2,5169
top,ham,1 in cbe. 2 in chennai.
freq,4516,1


In [27]:
import pandas as pd
import numpy as np
import chardet 
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from datetime import date
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

## splitting into test and train data

In [41]:
X = df['sms']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print("Shape of X is {}".format(X.shape))
print("Shape of X_train is {} and shape of y_train is {}".format(X_train.shape, y_train.shape))
print("Shape of X_test is {} and shape of y_test is {}".format(X_test.shape, y_test.shape))

Shape of X is (5169,)
Shape of X_train is (3876,) and shape of y_train is (3876,)
Shape of X_test is (1293,) and shape of y_test is (1293,)


# Multilayer perceptron

MLP is a kind of generalization of logistic regression, where between the input and output layer, we have additional layers of neurons, called hidden layers. Each layer transforms from previous layer by using a nonlinear activation(We will use relu) on the weighted linear sum. It then trains itself by backpropagation. In the following implementation, we shall use a 3 layer hidden network with 20 neurons in each layer.

In [42]:
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X_train=vectorizer.transform(list(X_train))
X_test=vectorizer.transform(list(X_test))
model=MLPClassifier(hidden_layer_sizes=(20,20,20),random_state=42)
model.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

### Cross validation scores

In [17]:
cv_scores = cross_val_score(model, X=X_train_MLP, y=y_train, cv=5)
print(cv_scores)

[0.99097938 0.98840206 0.97935484 0.98451613 0.98062016]


### Classification Report and Confusion Matrix

In [43]:

pred=model.predict(X_test)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.98      0.99      0.99      1124
       spam       0.96      0.89      0.93       169

avg / total       0.98      0.98      0.98      1293



Unnamed: 0,pred ham,pred spam
true ham,1118,18
true spam,6,151


# Multinomial Naive Baysian Classifier

Naive bayes is the algorithm which trains data based on Bayes theorem. It works by inverting conditional probabilities so that the query can be exprsessed as a function of measurable quantities.Multinomial naive Bayes assumes to have feature vector where each element represents the number of times it appears (or, very often, its frequency). Since we use tfidf, a measure of relative frequency, we choose to use MultinomialNB instead of BernoulliNB. The alpha parameter(Laplace smoothing parameter) is set to 15

In [11]:
from sklearn.naive_bayes import MultinomialNB


In [53]:
model_NB=MultinomialNB(alpha=15)
from sklearn import feature_extraction
vectorizer= feature_extraction.text.CountVectorizer(stop_words = 'english')
X_vec=vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, random_state=1,test_size=0.33)
model_NB.fit(X_train,y_train)

MultinomialNB(alpha=15, class_prior=None, fit_prior=True)

### Cross Validation Scores

In [54]:
print("The cross validation scores are: {}".format(cross_val_score(model_NB, X=X_train, y=y_train, cv=5)))

The cross validation scores are: [0.97113997 0.95670996 0.97113997 0.96531792 0.96242775]


### Classfication scores and confusion matrix

In [55]:
pred=model_NB.predict(X_test)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1501
       spam       0.99      0.73      0.84       205

avg / total       0.97      0.97      0.96      1706



Unnamed: 0,pred ham,pred spam
true ham,1500,56
true spam,1,149


# Support Vector Machine

Support vector machine is a dicriminative classifier which when given labelled training data, produces an optimal hyperplane which categorizes the data. In the following implementation,we use guassian kernel,(also called rbf, radial basis function). Kernel is a function that transforms data space so that optimal hyperplane can be found even for data which has non linear separator. We also tune the regularization parameter C to 500 so that optimizer will choose smaller margin hyperplane and will not misclassify much points.

In [56]:
from sklearn.svm import SVC


model_svm=SVC(C=500)
model_svm.fit(X_train,y_train)

SVC(C=500, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Cross Validation Scores

In [57]:
print("The cross validation scores are: {}".format(cross_val_score(model_svm, X=X_train, y=y_train, cv=5)))

The cross validation scores are: [0.97402597 0.96681097 0.98556999 0.98265896 0.97398844]


### Classification Scores and Confusion Matrix

In [58]:
pred=model_svm.predict(X_test)
print(classification_report(y_test,pred))
pd.DataFrame(confusion_matrix(pred, y_test),
      index=['true ham', 'true spam'], 
             columns=['pred ham', 'pred spam'])

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      1501
       spam       1.00      0.85      0.92       205

avg / total       0.98      0.98      0.98      1706



Unnamed: 0,pred ham,pred spam
true ham,1501,31
true spam,0,174
